From 6e74fd4945f1b0ccf72fda35daeedc8d6e632d0f Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Mon, 28 Apr 2025 15:15:58 -0700 Subject: [PATCH 001/461] Support loading transformers models with named parameters (#16868) Signed-off-by: Alex --- vllm/model_executor/models/transformers.py | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a37e88a38..ad7c07dc8 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -166,6 +166,9 @@ class TransformersModel(nn.Module): # Initialize buffers (e.g. rotary embedding inverse frequency) self.init_buffers(self.model) + # Initialize parameters + self.init_parameters(self.model) + # Move remaining meta tensors to device (should happen last) self.meta_to_empty(self.model) @@ -298,6 +301,25 @@ class TransformersModel(nn.Module): for child in module.children(): self.init_buffers(child) + def init_parameters(self, module: nn.Module): + """ + If a `parameter` is on the `meta` device, then its parent + `module` is the original module created by: + + ```python + with torch.device("meta"): + self.model: PreTrainedModel = AutoModel.from_config(...) 
+ ``` + """ + for name, param in module.named_parameters(recurse=False): + if param.device == torch.device("meta"): + new_param = nn.Parameter( + torch.empty_like(param.data, + device=self.device_config.device)) + setattr(module, name, new_param) + for child in module.children(): + self.init_parameters(child) + def meta_to_empty(self, module: nn.Module): tensors = list(chain(module.buffers(), module.parameters())) if tensors and all(t.device == torch.device("meta") for t in tensors): @@ -342,6 +364,7 @@ class TransformersModel(nn.Module): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) + loaded_params = set[str]() for name, loaded_weight in weights: # Use "model" instead of base_model_prefix because -- GitLab From 8fc88d63f1163f119dd740b1666069535f052ff3 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 28 Apr 2025 16:20:24 -0600 Subject: [PATCH 002/461] [Model] Add tuned triton fused_moe configs for Qwen3Moe (#17328) Signed-off-by: mgoin --- benchmarks/kernels/benchmark_moe.py | 8 +- ...192,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ .../E=128,N=192,device_name=NVIDIA_H20.json | 146 ++++++++++++++++++ .../E=128,N=192,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ .../E=128,N=384,device_name=NVIDIA_H20.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ .../E=128,N=384,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ .../E=128,N=768,device_name=NVIDIA_H20.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ .../E=128,N=768,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ .../E=128,N=96,device_name=NVIDIA_H20.json | 146 ++++++++++++++++++ 
.../layers/fused_moe/configs/README | 3 +- 15 files changed, 1904 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 912470fad..a274537a6 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -527,7 +527,7 @@ def get_weight_block_size_safety(config, default_value=None): def main(args: argparse.Namespace): print(args) - block_quant_shape = None + config = 
AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": @@ -546,8 +546,9 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - block_quant_shape = get_weight_block_size_safety(config) - elif config.architectures[0] == "Qwen2MoeForCausalLM": + elif config.architectures[0] in [ + "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM" + ]: E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size @@ -565,6 +566,7 @@ def main(args: argparse.Namespace): dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" + block_quant_shape = get_weight_block_size_safety(config) if args.batch_size is None: batch_sizes = [ diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..5de5605d4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json 
b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json new file mode 100644 index 000000000..2221e99cd --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json new file mode 100644 index 000000000..74374c573 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000..b34b6e4e8 --- 
/dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json new file mode 100644 index 000000000..ab169a018 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000..324ad7b22 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json new file mode 100644 index 000000000..ab6e15552 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..249359fb9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json 
@@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000..b4efc9b7e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 
+ }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json new file mode 100644 index 000000000..03dfc73b6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000..9c07695ba --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json new file mode 100644 index 000000000..beaac7f64 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + 
"2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json new file mode 100644 index 000000000..ebff99e26 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README index 787bd0611..85970e2d1 100644 --- a/vllm/model_executor/layers/fused_moe/configs/README +++ b/vllm/model_executor/layers/fused_moe/configs/README @@ -9,5 +9,4 @@ The example configurations provided are for the Mixtral model for TP2 on H100 and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have N = 7168 and for TP4 we have N = 3584. -Please feel free to tune the configurations using scripts in `benchmarks/kernels/benchmark_moe.py` -Some of the configurations files are copied from the SGLang repository. Thank you! +See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. 
-- GitLab From cfe45320931b7a1111ab7179a0ee473907b3f5ef Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 28 Apr 2025 19:46:15 -0400 Subject: [PATCH 003/461] [Benchmark] Add single turn MTBench to Serving Bench (#17202) --- benchmarks/benchmark_dataset.py | 54 +++++++++++++++++++++++++++++++++ benchmarks/benchmark_serving.py | 9 ++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index ccbc6c022..9c614baf1 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -771,6 +771,60 @@ class InstructCoderDataset(HuggingFaceDataset): return sampled_requests +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item['turns'][0] + + # apply template + prompt = tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + # ----------------------------------------------------------------------------- # AIMO Dataset Implementation # ----------------------------------------------------------------------------- diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index da124e1a8..c236d6426 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -52,9 +52,9 @@ except ImportError: from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset, ConversationDataset, HuggingFaceDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, SonnetDataset, - VisionArenaDataset) + InstructCoderDataset, MTBenchDataset, + RandomDataset, SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION 
= 1000 @@ -595,6 +595,9 @@ def main(args: argparse.Namespace): elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: dataset_class = InstructCoderDataset args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: dataset_class = ConversationDataset elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: -- GitLab From 506475de5f1a48c225e7cd7f87efd40ad6f3c9c0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Apr 2025 09:40:35 +0800 Subject: [PATCH 004/461] [Optim] Compute multimodal hash only once per item (#17314) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/deepseek_vl2.py | 16 +- vllm/model_executor/models/h2ovl.py | 16 +- vllm/model_executor/models/llava.py | 3 - vllm/model_executor/models/mistral3.py | 2 - vllm/model_executor/models/pixtral.py | 15 +- vllm/multimodal/processing.py | 309 ++++++++++++++------- 6 files changed, 233 insertions(+), 128 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index ac136698e..6d8f27530 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -22,8 +22,8 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptUpdate) + BaseProcessingInfo, MultiModalHashes, + PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, @@ -279,24 +279,26 @@ class DeepseekVL2MultiModalProcessor( prompt: Union[str, list[int]], 
mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - ) -> tuple[list[int], MultiModalKwargs, bool]: + *, + return_mm_hashes: bool, + ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only # perform caching for the most common case if mm_data_items.get_count("image", strict=False) > 2: - # This code path corresponds to the cache being disabled - return self._apply_hf_processor_main( + return self._apply_hf_processor( prompt=prompt, - mm_items=mm_data_items, + mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_update=True, + return_mm_hashes=return_mm_hashes, ) return super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + return_mm_hashes=return_mm_hashes, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 15e126b0f..99c226439 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -19,8 +19,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) -from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, - PromptUpdateDetails) +from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.transformers_utils.tokenizer import AnyTokenizer from .intern_vit import InternVisionModel @@ -488,24 +488,26 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - ) -> tuple[list[int], 
MultiModalKwargs, bool]: + *, + return_mm_hashes: bool, + ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only # perform caching for the most common case if mm_data_items.get_count("image", strict=False) > 1: - # This code path corresponds to the cache being disabled - return self._apply_hf_processor_main( + return self._apply_hf_processor( prompt=prompt, - mm_items=mm_data_items, + mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_update=True, + return_mm_hashes=return_mm_hashes, ) return super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + return_mm_hashes=return_mm_hashes, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8862b2679..16f5327ee 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -396,14 +396,12 @@ def _build_llava_or_pixtral_hf_processor( dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True, ) -> BaseMultiModalProcessor: if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( info, dummy_inputs, # type: ignore cache=cache, - enable_sanity_checks=enable_sanity_checks, ) if isinstance(info, LlavaProcessingInfo): @@ -411,7 +409,6 @@ def _build_llava_or_pixtral_hf_processor( info, dummy_inputs, # type: ignore cache=cache, - enable_sanity_checks=enable_sanity_checks, ) raise NotImplementedError(type(info)) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index f8e9e3181..12c87dc0f 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -312,14 +312,12 @@ def 
_build_mistral3_processor( dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True, ) -> BaseMultiModalProcessor: assert isinstance(info, Mistral3ProcessingInfo) return Mistral3MultiModalProcessor( info, dummy_inputs, # type: ignore cache=cache, - enable_sanity_checks=enable_sanity_checks, ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 73fd80146..d756b3b8a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -36,8 +36,9 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + BaseProcessingInfo, MultiModalHashes, + PromptReplacement, PromptUpdate, + PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import (MistralTokenizer, @@ -271,15 +272,19 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - ) -> tuple[list[int], MultiModalKwargs, bool]: - prompt_ids, mm_kwargs, _ = super()._cached_apply_hf_processor( + *, + return_mm_hashes: bool, + ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + prompt_ids, mm_kwargs, mm_hashes, _ = super( + )._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + return_mm_hashes=return_mm_hashes, ) # NOTE: The tokens are already inserted by the chat template - return prompt_ids, mm_kwargs, True + return prompt_ids, mm_kwargs, mm_hashes, True 
@MULTIMODAL_REGISTRY.register_processor(PixtralMultiModalProcessor, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 87131122e..d6ba8f1bc 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -876,6 +876,16 @@ def find_mm_placeholders( _V = TypeVar("_V", bound="Union[MultiModalKwargs, MultiModalKwargsItem]") +class ProcessingCacheOptionalItem(NamedTuple): + key: str + value: Optional[MultiModalKwargsItem] + + +class ProcessingCacheItem(NamedTuple): + key: str + value: MultiModalKwargsItem + + class ProcessingCache: @staticmethod @@ -980,6 +990,22 @@ class ProcessingCache: return self._cache.get(cache_key) + def get_item( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + ) -> ProcessingCacheOptionalItem: + cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + + return ProcessingCacheOptionalItem( + key=cache_key, + value=self._cache.get(cache_key), + ) + def put( self, model_id: str, @@ -997,6 +1023,9 @@ class ProcessingCache: **input_kwargs) self._cache[cache_key] = output_kwargs + def put_item(self, item: ProcessingCacheItem) -> None: + self._cache[item.key] = item.value + class BaseProcessingInfo: """Base class to provide the information necessary for data processing.""" @@ -1052,6 +1081,11 @@ class BaseProcessingInfo: _I = TypeVar("_I", bound=BaseProcessingInfo) +MultiModalHashes = dict[str, list[str]] +""" +A collection of hashes with a similar structure as :class:`MultiModalKwargs`. 
+""" + class BaseMultiModalProcessor(ABC, Generic[_I]): """ @@ -1064,14 +1098,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): info: _I, dummy_inputs: "BaseDummyInputsBuilder[_I]", *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: + cache: Optional[ProcessingCache] = None) -> None: super().__init__() self.info = info self.dummy_inputs = dummy_inputs self.cache = cache - self.enable_sanity_checks = enable_sanity_checks self.data_parser = self._get_data_parser() @@ -1340,46 +1372,144 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return prompt_ids, mm_kwargs, False + def _get_cache_missing_items( + self, + cache: ProcessingCache, + mm_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[dict[str, list[ProcessingCacheOptionalItem]], dict[ + str, list[object]]]: + model_id = self.info.model_id + + mm_cache_items = { + modality: [ + cache.get_item(model_id, modality, item, + hf_processor_mm_kwargs) for item in items + ] + for modality, items in mm_data_items.items() + } + + mm_missing_idxs = { + modality: [ + idx for idx, item in enumerate(cache_items) + if item.value is None + ] + for modality, cache_items in mm_cache_items.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + + return mm_cache_items, mm_missing_data + + def _hash_mm_items( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalHashes: + """Create MM hashes to be returned (only used in V1).""" + model_id = self.info.model_id + + return { + modality: [ + MultiModalHasher.hash_kwargs(model_id=model_id, + **{modality: item}, + **hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_items.items() + } + + def _merge_mm_kwargs( + self, + cache: ProcessingCache, + mm_cache_items: dict[str, list[ProcessingCacheOptionalItem]], + mm_missing_data: dict[str, 
list[object]], + mm_missing_kwargs: MultiModalKwargs, + ) -> dict[str, list[ProcessingCacheItem]]: + mm_missing_next_idx = {modality: 0 for modality in mm_missing_data} + + merged_items = defaultdict[str, list[ProcessingCacheItem]](list) + for modality, cache_items in mm_cache_items.items(): + for cache_item in cache_items: + if cache_item.value is None: + kw_item = mm_missing_kwargs.get_item( + modality, + mm_missing_next_idx[modality], + ) + cache_item_new = ProcessingCacheItem( + key=cache_item.key, + value=kw_item, + ) + + cache.put_item(cache_item_new) + mm_missing_next_idx[modality] += 1 + else: + cache_item_new = ProcessingCacheItem( + key=cache_item.key, + value=cache_item.value, + ) + + merged_items[modality].append(cache_item_new) + + return dict(merged_items) + + def _apply_hf_processor( + self, + prompt: Union[str, list[int]], + mm_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + return_mm_hashes: bool, + ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ( + prompt_ids, + mm_kwargs, + is_update_applied, + ) = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_update=True, + ) + + mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs) + if return_mm_hashes else None) + + return prompt_ids, mm_kwargs, mm_hashes, is_update_applied + def _cached_apply_hf_processor( self, prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - ) -> tuple[list[int], MultiModalKwargs, bool]: + *, + return_mm_hashes: bool, + ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: """ Apply the HF processor on the full prompt text, caching the results and reusing cached results. 
""" cache = self.cache - model_id = self.info.model_id _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - return self._apply_hf_processor_main( + return self._apply_hf_processor( prompt=prompt, - mm_items=mm_data_items, + mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_update=True, + return_mm_hashes=return_mm_hashes, ) - mm_maybe_cached_kw_items = { - modality: [ - cache.get(model_id, modality, item, hf_processor_mm_kwargs) - for item in items - ] - for modality, items in mm_data_items.items() - } - - mm_missing_idxs = { - modality: - [idx for idx, item in enumerate(kw_items) if item is None] - for modality, kw_items in mm_maybe_cached_kw_items.items() - } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } - mm_missing_data_items = self._to_mm_items(mm_missing_data) + ( + mm_cache_items, + mm_missing_data, + ) = self._get_cache_missing_items( + cache=cache, + mm_data_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) # NOTE: `prompt` does not correspond to `mm_missing_data_items`, # so we can't apply prompt updates until the new multimodal @@ -1390,48 +1520,29 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): is_update_applied, ) = self._apply_hf_processor_main( prompt=prompt, - mm_items=mm_missing_data_items, + mm_items=self._to_mm_items(mm_missing_data), hf_processor_mm_kwargs=hf_processor_mm_kwargs, enable_hf_prompt_update=False, ) - mm_missing_next_idx = { - modality: 0 - for modality in mm_missing_data_items - } - - merged_kw_items = list[MultiModalKwargsItem]() - for modality, kw_items in mm_maybe_cached_kw_items.items(): - for idx, kw_item in enumerate(kw_items): - if kw_item is None: - kw_item = mm_missing_kwargs.get_item( - modality, - mm_missing_next_idx[modality], - ) - - cache.put( - model_id, - modality, - mm_data_items[modality][idx], - 
hf_processor_mm_kwargs, - kw_item, - ) - - mm_missing_next_idx[modality] += 1 - - merged_kw_items.append(kw_item) + mm_cache_items_merged = self._merge_mm_kwargs( + cache, + mm_cache_items=mm_cache_items, + mm_missing_data=mm_missing_data, + mm_missing_kwargs=mm_missing_kwargs, + ) - if self.enable_sanity_checks: - mm_missing_counts = mm_missing_data_items.get_all_counts() - assert all( - item_count == mm_missing_counts[modality] - for modality, item_count in mm_missing_next_idx.items()), dict( - mm_missing_next_idx=mm_missing_next_idx, - mm_missing_counts=mm_missing_counts) + mm_kwargs = MultiModalKwargs.from_items([ + item.value for cache_items in mm_cache_items_merged.values() + for item in cache_items + ]) - mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) + mm_hashes = { + modality: [item.key for item in cache_items] + for modality, cache_items in mm_cache_items_merged.items() + } if return_mm_hashes else None - return prompt_ids, mm_kwargs, is_update_applied + return prompt_ids, mm_kwargs, mm_hashes, is_update_applied def _bind_and_group_updates( self, @@ -1569,27 +1680,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): "model (usually arising from an inconsistency between " "`_call_hf_processor` and `_get_prompt_updates`).") - def _hash_mm_items( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> dict[str, list[str]]: - """Create MM hashes to be returned (only used in V1).""" - - # TODO: Use these hash keys for caching operations in apply_hf_processor - # instead of rehashing. 
- model_id = self.info.model_id - - return { - modality: [ - MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: item}, - **hf_processor_mm_kwargs) - for item in items - ] - for modality, items in mm_items.items() - } - def _maybe_apply_prompt_updates( self, mm_items: MultiModalDataItems, @@ -1655,17 +1745,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ mm_items = self._to_mm_items(mm_data) - mm_hashes = (self._hash_mm_items(mm_items, hf_processor_mm_kwargs) - if return_mm_hashes else None) - ( prompt_ids, mm_kwargs, + mm_hashes, is_update_applied, ) = self._cached_apply_hf_processor( prompt, mm_items, hf_processor_mm_kwargs, + return_mm_hashes=return_mm_hashes, ) prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates( @@ -1717,28 +1806,12 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): """Create input prompt for the decoder.""" return prompt - def apply( + def _get_enc_dec_inputs( self, prompt: Union[str, list[int]], mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - return_mm_hashes: bool = False, - ) -> MultiModalEncDecInputs: - """ - Process multi-modal inputs to be used in vLLM. - The main processing steps are modified to fit encoder-decoder model: - 1. Create encoder prompt from input prompt text. - 2. Apply the HF processor on encoder prompt. - 3. Copy the input prompt text as decoder prompt inputs. 
- """ - encoder_prompt = self.create_encoder_prompt(prompt, mm_data) - encoder_inputs = super().apply( - encoder_prompt, - mm_data, - hf_processor_mm_kwargs, - return_mm_hashes, - ) - + encoder_inputs: MultiModalInputs, + ): tokenizer = self.info.get_tokenizer() decoder_prompt = self.create_decoder_prompt(prompt, mm_data) if isinstance(decoder_prompt, str): @@ -1758,3 +1831,31 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): "prompt_token_ids": decoder_prompt_ids }) return mm_inputs + + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + return_mm_hashes: bool = False, + ) -> MultiModalEncDecInputs: + """ + Process multi-modal inputs to be used in vLLM. + The main processing steps are modified to fit encoder-decoder model: + 1. Create encoder prompt from input prompt text. + 2. Apply the HF processor on encoder prompt. + 3. Copy the input prompt text as decoder prompt inputs. + """ + encoder_prompt = self.create_encoder_prompt(prompt, mm_data) + encoder_inputs = super().apply( + encoder_prompt, + mm_data, + hf_processor_mm_kwargs, + return_mm_hashes, + ) + + return self._get_enc_dec_inputs( + prompt=prompt, + mm_data=mm_data, + encoder_inputs=encoder_inputs, + ) -- GitLab From 86d9fc29cb39bbb09b5ac3202293eb6198666f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Moskal?= Date: Mon, 28 Apr 2025 19:21:32 -0700 Subject: [PATCH 005/461] implement Structural Tag with Guidance backend (#17333) Signed-off-by: Michal Moskal --- .../llm/test_struct_output_generate.py | 11 +++---- vllm/v1/structured_output/backend_guidance.py | 31 +++++++++++++++++-- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 19960c13c..3de4fec9c 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ 
b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -435,13 +435,10 @@ Given the previous instructions, what is the weather in New York City? """ # Change this once other backends support structural_tag - if guided_decoding_backend.startswith("xgrammar"): - outputs = llm.generate(prompts=prompt, - sampling_params=sampling_params, - use_tqdm=True) - assert outputs is not None - else: - outputs = [] + outputs = llm.generate(prompts=prompt, + sampling_params=sampling_params, + use_tqdm=True) + assert outputs is not None for output in outputs: assert output is not None diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 1453e284b..d4dc5e681 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -173,7 +173,8 @@ def serialize_guidance_grammar( disable_any_whitespace: bool = False, no_additional_properties: bool = False, ) -> str: - if request_type == StructuredOutputOptions.JSON: + + def _process_schema(grammar_spec: Union[str, dict[str, Any]], ) -> str: if no_additional_properties: grammar_spec = process_for_additional_properties(grammar_spec) return llguidance.LLMatcher.grammar_from_json_schema( @@ -181,6 +182,9 @@ def serialize_guidance_grammar( defaults={ "whitespace_flexible": not disable_any_whitespace, }) + + if request_type == StructuredOutputOptions.JSON: + return _process_schema(grammar_spec) elif request_type == StructuredOutputOptions.JSON_OBJECT: return llguidance.LLMatcher.grammar_from_json_schema( '{"type": "object"}', @@ -195,8 +199,29 @@ def serialize_guidance_grammar( elif request_type == StructuredOutputOptions.CHOICE: tp = "choice" elif request_type == StructuredOutputOptions.STRUCTURAL_TAG: - raise ValueError("Structural tag is not supported " - "for guidance backend yet") + if isinstance(grammar_spec, str): + s_tag = json.loads(grammar_spec) + else: + s_tag = grammar_spec + triggers: list[str] = s_tag["triggers"] + tags: 
list[llguidance.StructTag] = [] + for s in s_tag["structures"]: + begin: str = s["begin"] + trig = next((t for t in triggers if begin.startswith(t)), None) + if trig is None: + raise ValueError( + f"Trigger {begin} not found in triggers {triggers}") + tags.append( + llguidance.StructTag( + trigger=trig, + begin=s["begin"], + grammar=_process_schema(s["schema"]), + end=s["end"], + )) + if not tags: + raise ValueError( + "No structural tags found in the grammar spec.") + return llguidance.StructTag.to_grammar(tags) else: logger.error("Validation should have already occurred. " "Please file an issue.") -- GitLab From e1360005956d4887d101bbd675b7b0574a0afc45 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 28 Apr 2025 22:22:02 -0400 Subject: [PATCH 006/461] [V1][Spec Decode] Make Eagle model arch config driven (#17323) --- vllm/config.py | 3 ++- vllm/transformers_utils/configs/eagle.py | 19 ++++++++++++++++++- vllm/v1/spec_decode/eagle.py | 17 ++++++----------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e64510355..3ed1674b5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2401,7 +2401,8 @@ class SpeculativeConfig: pass else: eagle_config = EAGLEConfig( - self.draft_model_config.hf_config) + self.draft_model_config.hf_config, + method=self.method) self.draft_model_config.hf_config = eagle_config if (self.num_speculative_tokens is not None diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 3a9ad3e0f..586d5c7f5 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -15,6 +15,7 @@ class EAGLEConfig(PretrainedConfig): def __init__(self, model: Union[PretrainedConfig, dict, None] = None, truncated_vocab_size: Optional[int] = None, + method: Optional[str] = 'eagle', **kwargs): model_config: Union[PretrainedConfig, DeepseekV2Config, None] @@ -45,7 +46,23 @@ 
class EAGLEConfig(PretrainedConfig): if not envs.VLLM_USE_V1: kwargs["architectures"] = ["EAGLEModel"] else: - kwargs["architectures"] = ["EagleLlamaForCausalLM"] + # Eagle model name should follow naming convention of + # LlamaForCausalLM -> EagleLlamaForCausalLM + if method == "eagle": + assert self.model is not None, \ + "model should not be None when method is eagle" + kwargs["architectures"] = [ + f"Eagle{arch}" for arch in self.model.architectures + ] + elif method == "eagle3": + assert self.model is not None, \ + "model should not be None when method is eagle3" + kwargs["architectures"] = [ + f"Eagle3{arch}" for arch in self.model.architectures + ] + else: + raise ValueError(f"Invalid method {method}. \ + Supported methods are eagle and eagle3.") super().__init__(**kwargs) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1de14584d..8c45ca9a3 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,8 +9,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader.loader import get_model_loader from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.models.llama_eagle import EagleLlamaForCausalLM -from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.model_executor.models import ModelRegistry from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.sample.metadata import SamplingMetadata @@ -225,15 +224,11 @@ class EagleProposer: with set_default_torch_dtype( draft_model_config.dtype), set_current_vllm_config( self.vllm_config): - if self.vllm_config.speculative_config.method == "eagle": - self.model = EagleLlamaForCausalLM( - model_config=draft_model_config, - start_layer_id=target_layer_num).to(target_device) - else: - assert self.vllm_config.speculative_config.method == "eagle3" - self.model = Eagle3LlamaForCausalLM( - 
model_config=draft_model_config, - start_layer_id=target_layer_num).to(target_device) + draft_model_cls, arch = ModelRegistry.resolve_model_cls( + draft_model_config.architectures) + self.model = draft_model_cls( + model_config=draft_model_config, + start_layer_id=target_layer_num).to(target_device) loaded_weights = self.model.load_weights( loader.get_all_weights( -- GitLab From b4ac4fa04da14c4f40688fb98211367981a1f4d7 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 28 Apr 2025 19:22:22 -0700 Subject: [PATCH 007/461] [model] make llama4 compatible with pure dense layers (#17315) Signed-off-by: Lucia Fang --- vllm/model_executor/models/llama4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index e5d1a671f..0fdc30f36 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -273,8 +273,8 @@ class Llama4DecoderLayer(nn.Module): cache_config=cache_config, prefix=f"{prefix}.self_attn", ) - is_moe_layer = (self.layer_idx + - 1) % config.interleave_moe_layer_step == 0 + is_moe_layer = config.interleave_moe_layer_step > 0 and ( + self.layer_idx + 1) % config.interleave_moe_layer_step == 0 if is_moe_layer: self.feed_forward = Llama4MoE( config=config, -- GitLab From d6da8a8ff22e555ce516ca8ce4d005b1bd1d9fe2 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 28 Apr 2025 19:23:18 -0700 Subject: [PATCH 008/461] [Bugfix] Fix `numel()` downcast in fused_layernorm_dynamic_per_token_quant.cu (#17316) --- .../fused_kernels/fused_layernorm_dynamic_per_token_quant.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu index 2b6ab7fce..95aa92e25 100644 --- 
a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -96,7 +96,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( std::optional const& scale_ub, std::optional& residual) { int32_t hidden_size = input.size(-1); - int32_t num_tokens = input.numel() / hidden_size; + auto num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); dim3 block(std::min(hidden_size, 1024)); -- GitLab From 165cb56329e6cdbf58741b420c349793ec2390b3 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 28 Apr 2025 22:23:29 -0400 Subject: [PATCH 009/461] Ignore `''` filepath (#17330) Signed-off-by: rzou --- vllm/compilation/backends.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a1d12b517..7012131d0 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -382,6 +382,10 @@ class VllmBackend: hash_content = [] for filepath in forward_code_files: hash_content.append(filepath) + if filepath == "": + # This means the function was dynamically generated, with + # e.g. exec(). We can't actually check these. 
+ continue with open(filepath) as f: hash_content.append(f.read()) import hashlib -- GitLab From 17eb306fcc7018f01e37d6bb548c7cdfc65602a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Tue, 29 Apr 2025 10:24:07 +0800 Subject: [PATCH 010/461] [Bugfix] Add contiguous call inside rope kernel wrapper (#17091) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 苏政渊 Co-authored-by: 苏政渊 --- vllm/_custom_ops.py | 17 ++++++++++++++--- vllm/v1/attention/backends/mla/common.py | 7 +++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 4c577c1c4..7bb01507a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -158,8 +158,13 @@ def rotary_embedding( cos_sin_cache: torch.Tensor, is_neox: bool, ) -> None: - torch.ops._C.rotary_embedding(positions, query, key, head_size, - cos_sin_cache, is_neox) + # TODO: Remove this contiguous call when the kernel is updated to support tensor slices + query_contiguous = query.contiguous() + key_contiguous = key.contiguous() + torch.ops._C.rotary_embedding(positions, query_contiguous, key_contiguous, + head_size, cos_sin_cache, is_neox) + query.copy_(query_contiguous) + key.copy_(key_contiguous) def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, @@ -167,9 +172,15 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, + # TODO: Remove this contiguous call when the kernel is updated to support tensor slices + query_contiguous = query.contiguous() + key_contiguous = key.contiguous() + torch.ops._C.batched_rotary_embedding(positions, query_contiguous, + key_contiguous, head_size, cos_sin_cache, is_neox, rot_dim, cos_sin_cache_offsets) + 
query.copy_(query_contiguous) + key.copy_(key_contiguous) # layer norm ops diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index e6e483bae..b032006d1 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -938,8 +938,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): decode_ql_nope, decode_q_pe = \ self._q_proj_and_k_up_proj(decode_hs_or_q_c) decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( - attn_metadata.decode.input_positions, decode_q_pe.contiguous(), - decode_k_pe) + attn_metadata.decode.input_positions, decode_q_pe, decode_k_pe) if has_prefill: assert attn_metadata.prefill is not None @@ -948,8 +947,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:] prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( - attn_metadata.prefill.input_positions, - prefill_q_pe.contiguous(), prefill_k_pe) + attn_metadata.prefill.input_positions, prefill_q_pe, + prefill_k_pe) # write the latent and rope to kv cache if kv_cache.numel() > 0: -- GitLab From 96e06e3cb73f933bf26ff74599fd96d38c50805c Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 29 Apr 2025 10:53:44 +0800 Subject: [PATCH 011/461] [Misc] Add a Jinja template to support Mistral3 function calling (#17195) Signed-off-by: chaunceyjiang --- examples/tool_chat_template_mistral3.jinja | 119 +++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 examples/tool_chat_template_mistral3.jinja diff --git a/examples/tool_chat_template_mistral3.jinja b/examples/tool_chat_template_mistral3.jinja new file mode 100644 index 000000000..2b2f94d7e --- /dev/null +++ b/examples/tool_chat_template_mistral3.jinja @@ -0,0 +1,119 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was 
last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %} + +{{- bos_token }} + +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text'] %} + {%- set loop_messages = messages[1:] %} + {%- endif %} +{%- else %} + {%- set system_message = default_system_message %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- elif tools is not none %} + {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." 
%} + {%- if system_message is defined %} + {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %} + {%- else %} + {%- set system_message = parallel_tool_prompt %} + {%- endif %} +{%- endif %} +{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} + +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- else %} + {{- '[INST]' }} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] == 'image' or block['type'] == 'image_url' %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or 
message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message['role'] == 'assistant' %} + {%- if message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {{- message['content'][0]['text'] + eos_token }} + {%- endif %} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! 
(2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} \ No newline at end of file -- GitLab From cde384cd92c811c2237cf21681166fd41437c8a3 Mon Sep 17 00:00:00 2001 From: qscqesze Date: Tue, 29 Apr 2025 12:05:50 +0800 Subject: [PATCH 012/461] [Model] support MiniMax-VL-01 model (#16328) Signed-off-by: qingjun --- .../vision_language/test_models.py | 13 + .../vision_language/vlm_utils/model_utils.py | 19 + .../processing/test_minimax_vl_01.py | 99 +++ tests/models/registry.py | 2 + vllm/model_executor/models/minimax_text_01.py | 67 +- vllm/model_executor/models/minimax_vl_01.py | 615 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 14 +- vllm/transformers_utils/configs/__init__.py | 4 + .../configs/minimax_text_01.py | 69 ++ .../configs/minimax_vl_01.py | 70 ++ 11 files changed, 954 insertions(+), 19 deletions(-) create mode 100644 tests/models/multimodal/processing/test_minimax_vl_01.py create mode 100644 vllm/model_executor/models/minimax_vl_01.py create mode 100644 vllm/transformers_utils/configs/minimax_text_01.py create mode 100644 vllm/transformers_utils/configs/minimax_vl_01.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 9985cb579..6073364c0 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -446,6 +446,19 @@ VLM_TEST_SETTINGS = { hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, ), + "minimax_vl_01": VLMTestInfo( + models=["MiniMaxAI/MiniMax-VL-01"], + prompt_formatter=lambda img_prompt: f"user: {img_prompt} assistant:", # noqa: E501 + 
img_idx_to_prompt=lambda _: "", + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + max_model_len=8192, + max_num_seqs=4, + dtype="bfloat16", + hf_output_post_proc=model_utils.minimax_vl_01_hf_output, + patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner, + auto_cls=AutoModelForImageTextToText, + marks=[large_gpu_mark(min_gb=80)], + ), "molmo": VLMTestInfo( models=["allenai/Molmo-7B-D-0924"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 493053327..1185d80b9 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -229,6 +229,14 @@ def minicpmv_trunc_hf_output(hf_output: RunnerOutput, return output_ids, output_str, out_logprobs +def minimax_vl_01_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith(""): + output_str = output_str.split("")[0] + return output_ids, output_str, out_logprobs + + ####### Functions for converting image assets to embeddings def get_llava_embeddings(image_assets: _ImageAssets): return [asset.image_embeds for asset in image_assets] @@ -627,6 +635,17 @@ def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner: return hf_model +def minimax_vl_01_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + orig_generate = hf_model.model.generate + + def _generate(self, *args, image_sizes=None, **kwargs): + return orig_generate(*args, decode_text=False, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model + + def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for Molmo.""" hf_processor = hf_model.processor diff --git 
a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py new file mode 100644 index 000000000..d333c32dc --- /dev/null +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from PIL import Image + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + image_assets: _ImageAssets, + model_id: str, + num_imgs: int, +): + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + prompt = "" * num_imgs + image = Image.new("RGB", size=(364, 364)) + mm_data = {"image": [image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, {}) + image_placeholders = processed_inputs["mm_placeholders"]["image"] + + assert len(image_placeholders) == num_imgs + + +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + + try: + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, + num_imgs: int, + image_sizes: list[ImageSize], +) -> None: + + failed_size_excs = 
list[tuple[ImageSize, Exception]]() + + for size in image_sizes: + _validate_image_prompt_replacements_one(processor, num_imgs, + failed_size_excs, size) + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/registry.py b/tests/models/registry.py index a08924639..a3c5bc865 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -337,6 +337,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 trust_remote_code=True), + "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 + trust_remote_code=True), "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501 extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501 "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 74be08159..951f4e230 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ 
b/vllm/model_executor/models/minimax_text_01.py @@ -3,7 +3,7 @@ import copy import math import re -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Set, Tuple, Union import torch import torch.distributed @@ -110,7 +110,17 @@ class MiniMaxText01RMSNormTP(CustomOp): variance = tensor_model_parallel_all_reduce( variance) / self.tp_world x = x * torch.rsqrt(variance + self.variance_epsilon) - x = x.to(orig_dtype) * self.weight + + weight = self.weight + if x.size(-1) != self.weight.size(0): + if self.weight.size(0) < x.size(-1): + repeat_count = (x.size(-1) + self.weight.size(0)) // x.size(-1) + full_weight = self.weight.repeat(repeat_count) + weight = full_weight[:x.size(-1)] + else: + weight = self.weight[:x.size(-1)] + + x = x.to(orig_dtype) * weight return x def forward( @@ -421,6 +431,10 @@ class MiniMaxText01LinearAttention(nn.Module): attn_metadata): hidden = [] for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)): + if _prefill_idx >= len(attn_metadata.query_start_loc): + break + if _prefill_idx >= len(state_indices_tensor): + break _start = attn_metadata.query_start_loc[_prefill_idx] _end = attn_metadata.query_start_loc[_prefill_idx + 1] slot_id = state_indices_tensor[_prefill_idx] @@ -443,6 +457,10 @@ class MiniMaxText01LinearAttention(nn.Module): hidden.append( self._decode_infer(q, k, v, kv_cache, state_indices_tensor, attn_metadata)) + + if not hidden: + return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) + hidden = torch.concat(hidden, dim=0).contiguous() return hidden @@ -663,6 +681,9 @@ class MiniMaxText01DecoderLayer(nn.Module): self.shared_moe = False shared_intermediate = getattr(config, 'shared_intermediate_size', 0) + if isinstance(shared_intermediate, list): + shared_intermediate = shared_intermediate[ + layer_id] if layer_id < len(shared_intermediate) else 0 if shared_intermediate > 0: self.shared_moe = True self.shared_mlp = MiniMaxText01MLP( @@ 
-875,6 +896,8 @@ class MiniMaxText01Model(nn.Module): slots_to_clear = [] for _prefill_id in range(getattr(attn_metadata, "num_prefills", 0)): + if _prefill_id >= len(seq_id_map): + break seq_id = seq_id_map[_prefill_id] if attn_metadata.context_lens_tensor[ _prefill_id] == 0 and seq_id in seq_to_slot_maps: @@ -886,13 +909,18 @@ class MiniMaxText01Model(nn.Module): dtype=torch.long) minimax_cache_tensors[:, slots_tensor, ...] = 0 + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward(self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, - kv_caches: List[torch.Tensor], - intermediate_tensors=None, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs) -> torch.Tensor: + **kwargs) -> Union[torch.Tensor, IntermediateTensors]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata if attn_metadata is None: @@ -901,6 +929,7 @@ class MiniMaxText01Model(nn.Module): kwargs["request_ids_to_seq_ids"] = {} if "finished_requests_ids" not in kwargs: kwargs["finished_requests_ids"] = [] + ( minimax_cache_tensors, state_indices_tensor, @@ -922,15 +951,11 @@ class MiniMaxText01Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - kv_cache_index = 0 minimax_cache_index = 0 attn_metadata.rotary_emb = self.rotary_emb for i in range(self.start_layer, self.end_layer): layer = self.layers[i] _caches = None - if isinstance(layer.self_attn, MiniMaxText01Attention): - _caches = kv_caches[kv_cache_index] - kv_cache_index += 1 if isinstance(layer.self_attn, MiniMaxText01LinearAttention): current_state_layer = minimax_cache_index _caches = minimax_cache_params.at_layer_idx( @@ -1009,15 +1034,20 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, return self.model.minimax_cache.get_seqlen_agnostic_capture_inputs( batch_size) + 
def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, self.kv_cache, - intermediate_tensors, inputs_embeds, - **kwargs) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, **kwargs) return hidden_states @@ -1043,8 +1073,9 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, }) def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> None: + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() def which_layer(name: str) -> int: if "layers" in name: @@ -1108,6 +1139,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, weight_name, expert_id=expert_id, shard_id=shard_id) + loaded_params.add(name) break else: if is_pp_missing_parameter(name, self): @@ -1117,6 +1149,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, default_weight_loader) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return def is_shared_mlp_weight(name: str) -> bool: @@ -1154,6 +1187,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, else: raise AssertionError( "MLP weight not in [gate_up_proj, down_proj]") + loaded_params.add(name) return def is_mha_weight(name: str) -> bool: @@ -1170,6 +1204,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, MiniMaxText01LinearAttention.weight_direct_load) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return def load_flash_attn_weight(name: str, loaded_weight: torch.Tensor, @@ 
-1194,6 +1229,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, default_weight_loader) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) break else: if is_pp_missing_parameter(name, self): @@ -1204,6 +1240,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, default_weight_loader) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return def is_layer_norm_weight(name: str) -> bool: @@ -1219,6 +1256,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, default_weight_loader) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return def load_basic_weight(name: str, loaded_weight: torch.Tensor, @@ -1230,6 +1268,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, default_weight_loader) weight_loader = weight_loader_with_alias(name)(weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return for name, loaded_weight in weights: @@ -1258,4 +1297,4 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, continue load_basic_weight(name, loaded_weight, self) - return + return loaded_params diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py new file mode 100644 index 000000000..14e105586 --- /dev/null +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -0,0 +1,615 @@ +# SPDX-License-Identifier: Apache-2.0 + +from abc import abstractmethod +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass +from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, + TypeVar, Union, cast) + +import numpy as np +import torch +import torch.nn as nn +from transformers import BatchFeature, CLIPVisionConfig, PretrainedConfig +from 
transformers.image_processing_utils import select_best_resolution + +from vllm.config import VllmConfig +from vllm.jsontree import json_map_leaves +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config + +from .clip import CLIPVisionModel +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .pixtral import PixtralHFVisionModel +from .siglip import SiglipVisionModel +from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) +from .vision import get_vision_encoder_info + +logger = init_logger(__name__) + + +# For dummy input only +@dataclass +class MaxImageTokenMeta: + width: int = 1024 + height: int = 1024 + + +class MiniMaxVL01ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + + Note that `height` or `width` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. 
+ """ + + +class MiniMaxVL01ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, + # otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError("image_size invalid type " + + f"{type(image_size)} with value {image_size}") + image_size = image_size.tolist() + + best_resolution = select_best_resolution(image_size, grid_pinpoints) + height, width = best_resolution + num_patches = 0 + # consider change to ceil(height/patch_size)*ceil(width/patch_size) + 1 + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + num_patches += 1 + # add the base patch + num_patches += 1 + return num_patches + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! 
VERY IMPORTANT if image_size is tensor, + # must convert to into tuple, + # otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError( + "image_size invalid type " + + f"{type(image_size)} not valid, " + + "should be either list, tuple, np.ndarray or tensor") + image_size = image_size.tolist() + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +def unpad_image(tensor, original_size): + original_height, original_width = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + new_height = int(original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + new_width = int(original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +class MiniMaxVL01MultiModalProjector(nn.Module): + + def __init__(self, + vision_hidden_size: int, + text_hidden_size: int, + projector_hidden_act: str, + multimodal_projector_bias: bool, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.linear_1 = ColumnParallelLinear(vision_hidden_size, + text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + prefix=f"{prefix}.linear_1") + self.act = get_act_fn(projector_hidden_act) + self.linear_2 = RowParallelLinear(text_hidden_size, + text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + prefix=f"{prefix}.linear_2") + + def forward(self, image_features: torch.Tensor) -> torch.Tensor: + 
hidden_states, _ = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) + return hidden_states + + +class MiniMaxVL01LikeConfig(Protocol): + vision_config: Final[PretrainedConfig] + image_token_index: Final[int] + vision_feature_select_strategy: Final[str] + vision_feature_layer: Final[Union[int, list[int]]] + + +class MiniMaxVL01LikeProcessor(Protocol): + image_token: Final[str] + + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class MiniMaxVL01DummyInputsBuilder(BaseDummyInputsBuilder[_I]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + processor = self.info.get_hf_processor() + image_token = processor.image_token + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=MaxImageTokenMeta.width, + height=MaxImageTokenMeta.height, + num_images=num_images) + } + + +class MiniMaxVL01ProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(MiniMaxVL01Config) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) + + def _apply_feature_select_strategy( + self, + strategy: str, + encoder_num_image_tokens: int, + ) -> int: + if strategy == "default": + return encoder_num_image_tokens - 1 + if strategy == "full": + return encoder_num_image_tokens + + msg = f"Unexpected feature select strategy: {strategy!r}" + raise NotImplementedError(msg) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() + + return self._apply_feature_select_strategy( + 
hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + +class BaseMiniMaxVL01MultiModalProcessor(BaseMultiModalProcessor[_I]): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + + +class MiniMaxVL01MultiModalProcessor( + BaseMiniMaxVL01MultiModalProcessor[MiniMaxVL01ProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> 
BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values = processed_outputs.get("pixel_values") + if pixel_values is not None: + image_sizes = processed_outputs["image_sizes"] + min_len = min(len(pixel_values), len(image_sizes)) + pixel_values = pixel_values[:min_len] + image_sizes = image_sizes[:min_len] + assert len(pixel_values) == len(image_sizes) + + processed_outputs["pixel_values"] = [ + p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes) + ] + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return { + "pixel_values": MultiModalFieldConfig.batched("image"), + "image_embeds": MultiModalFieldConfig.batched("image"), + } + + +def _get_num_hidden_layers(hf_config: MiniMaxVL01LikeConfig) -> int: + """Determine the number of hidden layers to initialize up to in the + visual encoder. + + Args: + hf_config: Model config with vision feature layer(s). + """ + feature_layers = hf_config.vision_feature_layer + num_hidden_layers = hf_config.vision_config.num_hidden_layers + # If we have one feature layer, initialize up to that layer + if isinstance(feature_layers, int): + return _get_layer_index(feature_layers, num_hidden_layers) + # If we have multiple feature layers, initialize up to the deepest one + elif isinstance(feature_layers, (list, tuple)): + return max( + _get_layer_index(idx, num_hidden_layers) for idx in feature_layers) + raise TypeError(f"vision_layer_feature type: {type(feature_layers)}" + " is not supported") + + +def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: + """Given a signed vision feature layer, get the number of hidden layers + needed to leverage it. + + Args: + feature_layer_index: Index of a required layer in the visual encoder. 
+ num_hidden_layers: The total number of hidden layers in the visual + encoder. + """ + if feature_layer_index < 0: + return num_hidden_layers + feature_layer_index + 1 + return feature_layer_index + + +def init_vision_tower_for_MiniMaxVL01( + hf_config: MiniMaxVL01LikeConfig, + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, + prefix: str = "", +) -> Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel]: + vision_config = hf_config.vision_config + + # Initialize the vision tower only up to the deepest required feature layer + num_hidden_layers = _get_num_hidden_layers(hf_config) + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, + prefix=prefix, + ) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +@MULTIMODAL_REGISTRY.register_processor( + MiniMaxVL01MultiModalProcessor, + info=MiniMaxVL01ProcessingInfo, + dummy_inputs=MiniMaxVL01DummyInputsBuilder) +class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
+ self.vision_tower = init_vision_tower_for_MiniMaxVL01( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + self.multi_modal_projector = MiniMaxVL01MultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=True, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + self.image_newline = nn.Parameter( + torch.empty(config.text_config.hidden_size)) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + self.vision_feature_layer = config.vision_feature_layer + self.vocab_size = config.text_config.vocab_size + self.pad_token_id = -1 + if self.config.pad_token_id is not None: + self.pad_token_id = self.config.pad_token_id + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_index, + ) + return inputs_embeds + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def _select_image_features(self, image_features: torch.Tensor, *, + strategy: str) -> torch.Tensor: + if strategy == "default": + return image_features[:, 1:] + elif strategy == "full": + return image_features + + raise ValueError(f"Unexpected select feature strategy: {strategy}") + + def _image_pixels_to_features( + self, + vision_tower: Union[CLIPVisionModel], + pixel_values: Union[torch.Tensor, list[torch.Tensor]], + 
) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_tower(pixel_values) + + def select_features(leaf: torch.Tensor): + return self._select_image_features( + leaf, + strategy=self.config.vision_feature_select_strategy, + ) + + return cast( + Union[torch.Tensor, tuple[torch.Tensor, ...]], + json_map_leaves(select_features, image_features), + ) + + def _process_image_pixels( + self, + inputs: Union[MiniMaxVL01ImagePixelInputs], + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + assert self.vision_tower is not None + + pixel_values = inputs["pixel_values"] + + return self._image_pixels_to_features(self.vision_tower, pixel_values) + + def _process_image_input( + self, + image_input: MiniMaxVL01ImagePixelInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) + + if isinstance(image_features, torch.Tensor): + return self.multi_modal_projector(image_features) + + feature_sizes = [ + image_feature.shape[0] for image_feature in image_features + ] + + image_embeds = self.multi_modal_projector(torch.cat(image_features)) + image_embeds = torch.split(image_embeds, feature_sizes) + return image_embeds + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. 
" + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[MiniMaxVL01ImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return MiniMaxVL01ImagePixelInputs( + type="pixel_values", + pixel_values=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return MiniMaxVL01ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + raise AssertionError("This line should be unreachable.") + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + return self._process_image_input(image_input) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits( + self, + hidden_states: 
torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 33877829f..df5b23232 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -189,6 +189,7 @@ _MULTIMODAL_MODELS = { "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 + "MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501 "MiniCPMO": ("minicpmo", "MiniCPMO"), "MiniCPMV": ("minicpmv", "MiniCPMV"), "Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index e062afd68..5ddfadb02 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -34,11 +34,13 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, H2OVLChatConfig, InternVLChatConfig, JAISConfig, KimiVLConfig, MedusaConfig, - MllamaConfig, MLPSpeculatorConfig, - MPTConfig, NemotronConfig, - NVLM_D_Config, RWConfig, - SkyworkR1VChatConfig, SolarConfig, - Telechat2Config, UltravoxConfig) + MiniMaxText01Config, + MiniMaxVL01Config, MllamaConfig, + MLPSpeculatorConfig, MPTConfig, + NemotronConfig, NVLM_D_Config, + RWConfig, SkyworkR1VChatConfig, + SolarConfig, Telechat2Config, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import 
check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -73,6 +75,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "exaone": ExaoneConfig, "h2ovl_chat": H2OVLChatConfig, "internvl_chat": InternVLChatConfig, + "minimax_text_01": MiniMaxText01Config, + "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, "solar": SolarConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 8812d4c48..8945c45ea 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -15,6 +15,8 @@ from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig +from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config +from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from vllm.transformers_utils.configs.mllama import MllamaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig @@ -39,6 +41,8 @@ __all__ = [ "MedusaConfig", "EAGLEConfig", "ExaoneConfig", + "MiniMaxText01Config", + "MiniMaxVL01Config", "MllamaConfig", "MLPSpeculatorConfig", "MoonViTConfig", diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py new file mode 100644 index 000000000..660e870ac --- /dev/null +++ b/vllm/transformers_utils/configs/minimax_text_01.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +""" MiniMaxText01 model configuration""" + +from transformers.configuration_utils import PretrainedConfig + + +class MiniMaxText01Config(PretrainedConfig): + model_type = "MiniMaxText01" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + 
self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=None, + eos_token_id=None, + tie_word_embeddings=False, + rope_theta=1e6, + sliding_window=None, + attention_dropout=0.0, + num_experts_per_tok=2, + num_local_experts=8, + output_router_logits=False, + router_aux_loss_coef=0.001, + router_jitter_noise=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.router_jitter_noise = router_jitter_noise + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py new file mode 100644 index 000000000..99e0d249d --- /dev/null +++ b/vllm/transformers_utils/configs/minimax_vl_01.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +"""MiniMaxVL01 model configuration""" + 
+from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto import CONFIG_MAPPING + +from .minimax_text_01 import MiniMaxText01Config + + +class MiniMaxVL01Config(PretrainedConfig): + model_type = "minimax_vl_01" + + def __init__( + self, + vision_config=None, + text_config=None, + ignore_index=-100, + image_token_index=32000, + projector_hidden_act="gelu", + vision_feature_select_strategy="default", + vision_feature_layer=-2, + image_grid_pinpoints=None, + tie_word_embeddings=False, + image_seq_length=576, + **kwargs, + ): + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.image_seq_length = image_seq_length + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError("vision_feature_select_strategy should " + + "be one of 'default', 'full'." + + f"Got: {vision_feature_select_strategy}") + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + image_grid_pinpoints = ( + image_grid_pinpoints if image_grid_pinpoints is not None else + [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]) + self.image_grid_pinpoints = image_grid_pinpoints + + if isinstance(vision_config, dict): + if "model_type" not in vision_config: + vision_config["model_type"] = "clip_vision_model" + vision_config = CONFIG_MAPPING[vision_config["model_type"]]( + **vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["clip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if text_config is not None: + text_config = MiniMaxText01Config(**text_config) + else: + text_config = MiniMaxText01Config() + + self.text_config = text_config + + 
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -- GitLab From ebb3930d28927da0e432ba8923ef9f83c6fb12f5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Apr 2025 14:37:21 +0800 Subject: [PATCH 013/461] [Misc] Move config fields to MultiModalConfig (#17343) Signed-off-by: DarkLight1337 --- vllm/config.py | 57 +++++++++++++++++++------- vllm/engine/arg_utils.py | 16 ++------ vllm/inputs/registry.py | 6 ++- vllm/model_executor/models/qwen2_vl.py | 5 ++- vllm/multimodal/registry.py | 3 +- vllm/transformers_utils/processor.py | 3 +- vllm/v1/engine/mm_input_cache.py | 5 ++- vllm/v1/engine/processor.py | 3 +- 8 files changed, 62 insertions(+), 36 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3ed1674b5..c1c72846d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -263,6 +263,10 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. + mm_processor_kwargs: Overrides for the multi-modal processor obtained + from `AutoProcessor.from_pretrained`. + disable_mm_preprocessor_cache: If True, disable caching of the + processed multi-modal inputs. use_async_output_proc: Whether to use async output processor. Defaults to True. config_format: The config format which shall be loaded. @@ -273,10 +277,6 @@ class ModelConfig: hf_overrides: If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config. - mm_processor_kwargs: Arguments to be forwarded to the model's processor - for multi-modal data, e.g., image processor. - disable_mm_preprocessor_cache: If true, then disables caching of the - multi-modal preprocessor/mapper. 
(not recommended) override_neuron_config: Initialize non default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that @@ -320,7 +320,6 @@ class ModelConfig: factors.append(self.max_logprobs) factors.append(self.disable_sliding_window) factors.append(self.trust_remote_code) - factors.append(self.mm_processor_kwargs) factors.append(self.generation_config) factors.append(self.model_impl) factors.append(self.override_generation_config) @@ -359,12 +358,12 @@ class ModelConfig: skip_tokenizer_init: bool = False, served_model_name: Optional[Union[str, list[str]]] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, + disable_mm_preprocessor_cache: bool = False, use_async_output_proc: bool = True, config_format: ConfigFormat = ConfigFormat.AUTO, hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - disable_mm_preprocessor_cache: bool = False, override_neuron_config: Optional[dict[str, Any]] = None, override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, @@ -469,8 +468,6 @@ class ModelConfig: self.model, hf_token=hf_token, revision=revision) self.dtype = _get_and_verify_dtype(self.hf_config, dtype) self.use_async_output_proc = use_async_output_proc - self.mm_processor_kwargs = mm_processor_kwargs - self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache # Set enforce_eager to False if the value is unset. 
if self.enforce_eager is None: @@ -515,7 +512,10 @@ class ModelConfig: self.served_model_name = get_served_model_name(model, served_model_name) self.multimodal_config = self._init_multimodal_config( - limit_mm_per_prompt) + limit_mm_per_prompt=limit_mm_per_prompt, + mm_processor_kwargs=mm_processor_kwargs, + disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + ) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -581,14 +581,27 @@ class ModelConfig: self.tokenizer = s3_tokenizer.dir def _init_multimodal_config( - self, limit_mm_per_prompt: Optional[dict[str, int]] + self, + limit_mm_per_prompt: Optional[dict[str, int]], + mm_processor_kwargs: Optional[dict[str, Any]], + disable_mm_preprocessor_cache: bool, ) -> Optional["MultiModalConfig"]: if self.registry.is_multimodal_model(self.architectures): - return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {}) + return MultiModalConfig( + limit_per_prompt=limit_mm_per_prompt or {}, + mm_processor_kwargs=mm_processor_kwargs or {}, + disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + ) if limit_mm_per_prompt: raise ValueError("`limit_mm_per_prompt` is only supported for " "multimodal models.") + if mm_processor_kwargs: + raise ValueError("`mm_processor_kwargs` is only supported for " + "multimodal models.") + if disable_mm_preprocessor_cache: + raise ValueError("`disable_mm_preprocessor_cache` is only " + "supported for multimodal models.") return None @@ -2776,7 +2789,23 @@ class MultiModalConfig: Defaults to 1 (V0) or 999 (V1) for each modality. For example, to allow up to 16 images and 2 videos per prompt: - ``{"images": 16, "videos": 2}`` + :code:`{"images": 16, "videos": 2}` + """ + + mm_processor_kwargs: Optional[dict[str, object]] = None + """ + Overrides for the multi-modal processor obtained from + :meth:`transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. 
+ + For example, for Phi-3-Vision: + :code:`{"num_crops": 4}`. + """ + + disable_mm_preprocessor_cache: bool = False + """ + If :code:`True`, disable caching of the processed multi-modal inputs. """ def compute_hash(self) -> str: @@ -4080,8 +4109,6 @@ class VllmConfig: f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " - f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, " # noqa - f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, " f"pooler_config={self.model_config.pooler_config!r}, " f"compilation_config={self.compilation_config!r}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5d735103f..970a8851f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -672,20 +672,12 @@ class EngineArgs: ) multimodal_group.add_argument('--limit-mm-per-prompt', **multimodal_kwargs["limit_per_prompt"]) - - parser.add_argument( + multimodal_group.add_argument( '--mm-processor-kwargs', - default=None, - type=json.loads, - help=('Overrides for the multi-modal processor obtained from ' - '``AutoProcessor.from_pretrained``. The available overrides ' - 'depend on the model that is being run.' 
- 'For example, for Phi-3-Vision: ``{"num_crops": 4}``.')) - parser.add_argument( + **multimodal_kwargs["mm_processor_kwargs"]) + multimodal_group.add_argument( '--disable-mm-preprocessor-cache', - action='store_true', - help='If True, disable caching of the processed multi-modal ' - 'inputs.') + **multimodal_kwargs["disable_mm_preprocessor_cache"]) # LoRA related configs lora_kwargs = get_kwargs(LoRAConfig) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4c334ab62..d969922d5 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -101,7 +101,8 @@ class InputContext: Initialize a HuggingFace-like processor class, merging the keyword arguments with those in the model's configuration. """ - base_kwargs = self.model_config.mm_processor_kwargs + mm_config = self.model_config.get_multimodal_config() + base_kwargs = mm_config.mm_processor_kwargs if base_kwargs is None: base_kwargs = {} @@ -139,7 +140,8 @@ class InputProcessingContext(InputContext): """ assert callable(hf_processor) - base_kwargs = self.model_config.mm_processor_kwargs + mm_config = self.model_config.get_multimodal_config() + base_kwargs = mm_config.mm_processor_kwargs if base_kwargs is None: base_kwargs = {} diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ef84becd2..95f0c29d4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -774,8 +774,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): size: Optional[dict[str, int]] = None, **kwargs: object, ): - if self.ctx.model_config.mm_processor_kwargs: - kwargs.update(self.ctx.model_config.mm_processor_kwargs) + mm_config = self.ctx.model_config.get_multimodal_config() + if mm_config.mm_processor_kwargs: + kwargs.update(mm_config.mm_processor_kwargs) if min_pixels is not None: kwargs["min_pixels"] = min_pixels diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ec4f15681..68598b941 100644 --- 
a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -262,7 +262,8 @@ class MultiModalRegistry: if tokenizer is None: tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: - disable_cache = model_config.disable_mm_preprocessor_cache + mm_config = model_config.get_multimodal_config() + disable_cache = mm_config.disable_mm_preprocessor_cache model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 4f06950c4..d27c26659 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -33,7 +33,8 @@ class HashableList(list): def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs): - base_kwargs = model_config.mm_processor_kwargs + mm_config = model_config.get_multimodal_config() + base_kwargs = mm_config.mm_processor_kwargs if base_kwargs is None: base_kwargs = {} diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index c765c1bbf..64ece840f 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -33,7 +33,10 @@ from vllm.utils import is_list_of class MirroredProcessingCache: def __init__(self, model_config): - self.use_cache = not model_config.disable_mm_preprocessor_cache + mm_config = model_config.multimodal_config + disable_mm_preprocessor_cache = mm_config is not None and \ + not mm_config.disable_mm_preprocessor_cache + self.use_cache = not disable_mm_preprocessor_cache self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index fa334302e..5c15e8bae 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -51,8 +51,7 @@ class Processor: self.mm_input_cache_client = MirroredProcessingCache(self.model_config) # Multi-modal hasher (for images) - self.use_hash = ( - not 
self.model_config.disable_mm_preprocessor_cache) or \ + self.use_hash = self.mm_input_cache_client.use_cache or \ self.cache_config.enable_prefix_caching def _validate_logprobs( -- GitLab From bdb2cddafc524380e0d04cf1aa32b41d9411d6fd Mon Sep 17 00:00:00 2001 From: ponix-j <55234879+ponix-j@users.noreply.github.com> Date: Tue, 29 Apr 2025 14:59:13 +0800 Subject: [PATCH 014/461] [Misc]Use a platform independent interface to obtain the device attributes (#17100) --- tests/conftest.py | 3 ++- tests/v1/sample/test_sampler.py | 4 +++- vllm/worker/multi_step_model_runner.py | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e62b56cb5..5fc09b241 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -293,7 +293,8 @@ class HfRunner: def get_default_device(self): from vllm.platforms import current_platform - return ("cpu" if current_platform.is_cpu() else "cuda") + return ("cpu" + if current_platform.is_cpu() else current_platform.device_type) def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if x is None or isinstance(x, (bool, )): diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 5f041b448..24b759bc1 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -6,6 +6,7 @@ import numpy as np import pytest import torch +from vllm.platforms import current_platform from vllm.utils import make_tensor_with_pad from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -13,7 +14,8 @@ from vllm.v1.sample.sampler import Sampler VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + f"{current_platform.device_type}:{i}" + for i in range(1 if current_platform.device_count() == 1 else 2) ] MAX_NUM_PROMPT_TOKENS = 64 diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 
a6f5ec825..58bf31cf2 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, SamplerOutput, SamplingMetadata, get_logprobs, get_pythonized_sample_results) +from vllm.platforms import current_platform from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceGroupMetadata, SequenceOutput) from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream @@ -158,8 +159,8 @@ class StatefulModelInput(BroadcastableModelInput): is_first_multi_step: bool = False base_output_proc_callback: Optional[Callable] = None # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[torch.cuda.Event] = field( - default_factory=lambda: [torch.cuda.Event(blocking=True)] * 2) + step_cuda_events: List[current_platform.Event] = field( + default_factory=lambda: [current_platform.Event(blocking=True)] * 2) num_seqs: int = -1 num_queries: int = -1 num_single_step_prefills: int = 0 -- GitLab From 193e78e35d6f66bed5cec7414d5da2d3de777381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Tue, 29 Apr 2025 16:16:17 +0900 Subject: [PATCH 015/461] [Fix] Documentation spacing in compilation config help text (#17342) Signed-off-by: Zerohertz --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 970a8851f..ad2624123 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -866,7 +866,7 @@ class EngineArgs: '-O', type=CompilationConfig.from_cli, default=None, - help='torch.compile configuration for the model.' + help='torch.compile configuration for the model. 
' 'When it is a number (0, 1, 2, 3), it will be ' 'interpreted as the optimization level.\n' 'NOTE: level 0 is the default level without ' -- GitLab From 44641092197c59592c51e925c71bdf6f6c5b49ea Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 29 Apr 2025 03:17:23 -0400 Subject: [PATCH 016/461] [Build][Bugfix] Restrict setuptools version to <80 (#17320) Signed-off-by: Gregory Shtrasberg --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 33c4c3219..36fc791cc 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -34,7 +34,7 @@ mistral_common[opencv] >= 1.5.4 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 -setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 +setuptools>=74.1.1,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
compressed-tensors == 0.9.3 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config -- GitLab From 97cc8729f0bc351a5536380fd897607f4ecdeef1 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 29 Apr 2025 03:30:40 -0400 Subject: [PATCH 017/461] [Model] Ignore rotary embed load for Cohere model (#17319) --- vllm/model_executor/models/commandr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 25b1d5a19..8f64e5d5c 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -418,6 +418,10 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): loaded_params: Set[str] = set() for name, loaded_weight in weights: + # Skip loading rotary embeddings since vLLM has its own + if "rotary_emb.inv_freq" in name: + continue + if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): # Loading kv cache quantization scales -- GitLab From 4a5e13149a5db287baa395f0d639bb00ffdbcd25 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 12:35:47 +0100 Subject: [PATCH 018/461] Update docs requirements (#17379) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/docs.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index d84fd633c..cba86b52a 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,10 +1,10 @@ -sphinx==6.2.1 -sphinx-argparse==0.4.0 -sphinx-book-theme==1.0.1 +sphinx==8.2.3 +sphinx-argparse==0.5.2 +sphinx-book-theme==1.1.4 sphinx-copybutton==0.5.2 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 -myst-parser==3.0.1 +myst-parser==4.0.1 msgspec cloudpickle commonmark # Required by sphinx-argparse when 
using :markdownhelp: -- GitLab From 890f104cdfe559136249872955e7234ea1bd298d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 29 Apr 2025 20:38:32 +0800 Subject: [PATCH 019/461] [Doc] Fix QWen3MOE info (#17381) Signed-off-by: Jee Jee Li --- docs/source/models/supported_models.md | 4 ++-- tests/models/registry.py | 12 ++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 98b7d7631..5433805b6 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -542,8 +542,8 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `Qwen3MoeForCausalLM` * Qwen3MoE - * `Qwen/Qwen3-MoE-15B-A2B`, etc. - * ✅︎ + * `Qwen/Qwen3-30B-A3B`, etc. + * * ✅︎ - * `StableLmForCausalLM` * StableLM diff --git a/tests/models/registry.py b/tests/models/registry.py index a3c5bc865..8b330109d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -217,16 +217,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501 "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), - "Qwen3ForCausalLM": _HfExamplesInfo( - "Qwen/Qwen3-8B", - is_available_online=False, - min_transformers_version="4.51" - ), - "Qwen3MoeForCausalLM": _HfExamplesInfo( - "Qwen/Qwen3-MoE-15B-A2B", - is_available_online=False, - min_transformers_version="4.51" - ), + "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), + "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", is_available_online=False), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 -- GitLab From 00ee37efa23600d7c89d8fd5dc8bdc125c49e39d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Apr 2025 20:42:16 +0800 Subject: [PATCH 020/461] 
[Bugfix] Clean up MiniMax-VL and fix processing (#17354) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 7 + .../multimodal/processing/test_common.py | 1 + .../processing/test_minimax_vl_01.py | 1 - vllm/model_executor/models/minimax_vl_01.py | 312 ++---------------- 4 files changed, 38 insertions(+), 283 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5433805b6..95e7d5d60 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -979,6 +979,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `MiniMaxVL01ForConditionalGeneration` + * MiniMax-VL + * T + IE+ + * `MiniMaxAI/MiniMax-VL-01`, etc. + * + * ✅︎ + * ✅︎ - * `Mistral3ForConditionalGeneration` * Mistral3 * T + I+ diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index b3c56e18b..4dc49d18c 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -270,6 +270,7 @@ def _test_processing_correctness_mistral( "openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", + "MiniMaxAI/MiniMax-VL-01", "allenai/Molmo-7B-D-0924", "allenai/Molmo-7B-O-0924", "nvidia/NVLM-D-72B", diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index d333c32dc..10de28ab5 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -12,7 +12,6 @@ from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"]) -# yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( image_assets: _ImageAssets, diff --git a/vllm/model_executor/models/minimax_vl_01.py 
b/vllm/model_executor/models/minimax_vl_01.py index 14e105586..4ac60f97b 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,52 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, Set, Tuple, TypedDict, Union, cast -from abc import abstractmethod -from collections.abc import Iterable, Mapping, Sequence -from dataclasses import dataclass -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union, cast) - -import numpy as np import torch import torch.nn as nn -from transformers import BatchFeature, CLIPVisionConfig, PretrainedConfig -from transformers.image_processing_utils import select_best_resolution +from transformers import BatchFeature from vllm.config import VllmConfig from vllm.jsontree import json_map_leaves -from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict -from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs -from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize, MultiModalDataItems) -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, 
SupportsPP +from .llava import (BaseLlavaMultiModalProcessor, LlavaDummyInputsBuilder, + init_vision_tower_for_llava) +from .llava_next import LlavaNextProcessingInfo from .pixtral import PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import get_vision_encoder_info - -logger = init_logger(__name__) - - -# For dummy input only -@dataclass -class MaxImageTokenMeta: - width: int = 1024 - height: int = 1024 class MiniMaxVL01ImagePixelInputs(TypedDict): @@ -69,66 +49,8 @@ class MiniMaxVL01ImageEmbeddingInputs(TypedDict): """ -def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): - if not isinstance(grid_pinpoints, list): - raise TypeError("grid_pinpoints should be a list of tuples or lists") - - # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, - # otherwise it will cause wrong calculate - if not isinstance(image_size, (list, tuple)): - if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise TypeError("image_size invalid type " + - f"{type(image_size)} with value {image_size}") - image_size = image_size.tolist() - - best_resolution = select_best_resolution(image_size, grid_pinpoints) - height, width = best_resolution - num_patches = 0 - # consider change to ceil(height/patch_size)*ceil(width/patch_size) + 1 - for i in range(0, height, patch_size): - for j in range(0, width, patch_size): - num_patches += 1 - # add the base patch - num_patches += 1 - return num_patches - - -def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): - if not isinstance(grid_pinpoints, list): - raise TypeError("grid_pinpoints should be a list of tuples or lists") - - # ! 
VERY IMPORTANT if image_size is tensor, - # must convert to into tuple, - # otherwise it will cause wrong calculate - if not isinstance(image_size, (list, tuple)): - if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise TypeError( - "image_size invalid type " + - f"{type(image_size)} not valid, " + - "should be either list, tuple, np.ndarray or tensor") - image_size = image_size.tolist() - - height, width = select_best_resolution(image_size, grid_pinpoints) - return height // patch_size, width // patch_size - - -def unpad_image(tensor, original_size): - original_height, original_width = original_size - current_height, current_width = tensor.shape[1:] - - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height - - if original_aspect_ratio > current_aspect_ratio: - new_height = int(original_height * current_width) // original_width - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding:current_height - padding, :] - else: - new_width = int(original_width * current_height) // original_height - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding:current_width - padding] - - return unpadded_tensor +MiniMaxVL01ImageInputs = Union[MiniMaxVL01ImagePixelInputs, + MiniMaxVL01ImageEmbeddingInputs] class MiniMaxVL01MultiModalProjector(nn.Module): @@ -161,144 +83,29 @@ class MiniMaxVL01MultiModalProjector(nn.Module): return hidden_states -class MiniMaxVL01LikeConfig(Protocol): - vision_config: Final[PretrainedConfig] - image_token_index: Final[int] - vision_feature_select_strategy: Final[str] - vision_feature_layer: Final[Union[int, list[int]]] - - -class MiniMaxVL01LikeProcessor(Protocol): - image_token: Final[str] - +class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder): + pass -_I = TypeVar("_I", bound=BaseProcessingInfo) - -class MiniMaxVL01DummyInputsBuilder(BaseDummyInputsBuilder[_I]): - - def get_dummy_text(self, mm_counts: 
Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - processor = self.info.get_hf_processor() - image_token = processor.image_token - return image_token * num_images - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - return { - "image": - self._get_dummy_images(width=MaxImageTokenMeta.width, - height=MaxImageTokenMeta.height, - num_images=num_images) - } - - -class MiniMaxVL01ProcessingInfo(BaseProcessingInfo): +class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(MiniMaxVL01Config) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_vision_encoder_info(self): - return get_vision_encoder_info(self.get_hf_config()) + def get_hf_processor(self, **kwargs: object): + hf_processor = self.ctx.get_hf_processor(**kwargs) + image_processor = hf_processor.image_processor + image_processor.anyres_preprocess = ( + image_processor.anyres_for_vllm_preprocess) - def _apply_feature_select_strategy( - self, - strategy: str, - encoder_num_image_tokens: int, - ) -> int: - if strategy == "default": - return encoder_num_image_tokens - 1 - if strategy == "full": - return encoder_num_image_tokens - - msg = f"Unexpected feature select strategy: {strategy!r}" - raise NotImplementedError(msg) - - def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self.get_hf_config() - vision_encoder_info = self.get_vision_encoder_info() - - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + return hf_processor - def get_image_size_with_most_features(self) -> ImageSize: - vision_encoder_info = self.get_vision_encoder_info() - width = height = vision_encoder_info.get_image_size() 
- return ImageSize(width=width, height=height) - - def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - image_width=target_width, - image_height=target_height, - ) - - -class BaseMiniMaxVL01MultiModalProcessor(BaseMultiModalProcessor[_I]): - - # Copied from BaseMultiModalProcessor - @abstractmethod - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - raise NotImplementedError - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index - - def get_replacement(item_idx: int): - images = mm_items.get_items( - "image", (ImageEmbeddingItems, ImageProcessorItems)) - - if isinstance(images, ImageEmbeddingItems): - num_image_tokens = images.get_feature_size(item_idx) - else: - image_size = images.get_image_size(item_idx) - num_image_tokens = self.info.get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - - return [image_token_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement, - ), - ] + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} class MiniMaxVL01MultiModalProcessor( - BaseMiniMaxVL01MultiModalProcessor[MiniMaxVL01ProcessingInfo]): + BaseLlavaMultiModalProcessor[MiniMaxVL01ProcessingInfo]): def _call_hf_processor( self, @@ -314,10 +121,9 @@ class MiniMaxVL01MultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly image_sizes = 
processed_outputs["image_sizes"] - min_len = min(len(pixel_values), len(image_sizes)) - pixel_values = pixel_values[:min_len] - image_sizes = image_sizes[:min_len] assert len(pixel_values) == len(image_sizes) processed_outputs["pixel_values"] = [ @@ -337,65 +143,6 @@ class MiniMaxVL01MultiModalProcessor( } -def _get_num_hidden_layers(hf_config: MiniMaxVL01LikeConfig) -> int: - """Determine the number of hidden layers to initialize up to in the - visual encoder. - - Args: - hf_config: Model config with vision feature layer(s). - """ - feature_layers = hf_config.vision_feature_layer - num_hidden_layers = hf_config.vision_config.num_hidden_layers - # If we have one feature layer, initialize up to that layer - if isinstance(feature_layers, int): - return _get_layer_index(feature_layers, num_hidden_layers) - # If we have multiple feature layers, initialize up to the deepest one - elif isinstance(feature_layers, (list, tuple)): - return max( - _get_layer_index(idx, num_hidden_layers) for idx in feature_layers) - raise TypeError(f"vision_layer_feature type: {type(feature_layers)}" - " is not supported") - - -def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: - """Given a signed vision feature layer, get the number of hidden layers - needed to leverage it. - - Args: - feature_layer_index: Index of a required layer in the visual encoder. - num_hidden_layers: The total number of hidden layers in the visual - encoder. 
- """ - if feature_layer_index < 0: - return num_hidden_layers + feature_layer_index + 1 - return feature_layer_index - - -def init_vision_tower_for_MiniMaxVL01( - hf_config: MiniMaxVL01LikeConfig, - quant_config: Optional[QuantizationConfig], - *, - require_post_norm: Optional[bool] = None, - prefix: str = "", -) -> Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel]: - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the deepest required feature layer - num_hidden_layers = _get_num_hidden_layers(hf_config) - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers, - require_post_norm=require_post_norm, - prefix=prefix, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - @MULTIMODAL_REGISTRY.register_processor( MiniMaxVL01MultiModalProcessor, info=MiniMaxVL01ProcessingInfo, @@ -419,7 +166,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = init_vision_tower_for_MiniMaxVL01( + self.vision_tower = init_vision_tower_for_llava( config, quant_config, require_post_norm=False, @@ -476,7 +223,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, def _image_pixels_to_features( self, - vision_tower: Union[CLIPVisionModel], + vision_tower: Union[CLIPVisionModel, SiglipVisionModel, + PixtralHFVisionModel], pixel_values: Union[torch.Tensor, list[torch.Tensor]], ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: # NOTE: we skip the step to select the vision feature layer since @@ -496,7 +244,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_pixels( self, - inputs: Union[MiniMaxVL01ImagePixelInputs], + inputs: MiniMaxVL01ImagePixelInputs, ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: assert self.vision_tower is not None @@ -506,7 +254,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_input( self, - image_input: MiniMaxVL01ImagePixelInputs, + image_input: MiniMaxVL01ImageInputs, ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -539,7 +287,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, return data def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[MiniMaxVL01ImagePixelInputs]: + self, **kwargs: object) -> Optional[MiniMaxVL01ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) -- GitLab From 40896bdf3f22d9681b1cc5831d271fcc7ea56ae8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 14:46:55 +0100 Subject: [PATCH 021/461] `pre-commit autoupdate` (#17380) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 12 ++++++------ csrc/moe/marlin_kernels/marlin_moe_kernel.h | 8 ++++---- 
csrc/moe/marlin_moe_wna16/marlin_template.h | 8 ++++---- csrc/moe/moe_wna16_utils.h | 16 ++++++++-------- .../gptq_allspark/allspark_qgemm_w8a16.cu | 2 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 16 ++++++++-------- .../marlin/dense/marlin_cuda_kernel.cu | 4 ++-- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 4 ++-- csrc/quantization/marlin/sparse/common/mma.h | 4 ++-- 9 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f76b24c02..87681d7eb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,29 +12,29 @@ repos: - id: yapf args: [--in-place, --verbose] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.3 + rev: v0.11.7 hooks: - id: ruff args: [--output-format, github, --fix] - repo: https://github.com/codespell-project/codespell - rev: v2.4.0 + rev: v2.4.1 hooks: - id: codespell additional_dependencies: ['tomli'] args: ['--toml', 'pyproject.toml'] - repo: https://github.com/PyCQA/isort - rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.7 + rev: v20.1.3 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.27 + rev: v0.9.29 hooks: - id: pymarkdown args: [fix] @@ -43,7 +43,7 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.6.2 + rev: 0.6.17 hooks: - id: pip-compile args: [requirements/test.in, -o, requirements/test.txt] diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index 47ecf109d..a217401b3 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ 
inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h index 205b308fe..3705216ca 100644 --- a/csrc/moe/marlin_moe_wna16/marlin_template.h +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -209,8 +209,8 @@ __device__ inline typename ScalarType::FragB dequant( const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -233,9 +233,9 @@ dequant(int q, // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC308C308; diff --git a/csrc/moe/moe_wna16_utils.h b/csrc/moe/moe_wna16_utils.h index 4396b8024..8ef03f0e6 100644 --- a/csrc/moe/moe_wna16_utils.h +++ b/csrc/moe/moe_wna16_utils.h @@ -108,11 +108,11 @@ __device__ inline void dequant(int q, half2* res) { const int MUL = 0x2c002c00; const int ADD = 0xd400d400; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); q >>= 8; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); res[0] = __hsub2(*reinterpret_cast(&lo0), *reinterpret_cast(&SUB)); @@ -149,13 +149,13 @@ __device__ inline void dequant(int q, nv_bfloat162* res) { static constexpr uint32_t MASK = 0x000f000f; static constexpr uint32_t EX = 0x43004300; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC300C300; diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu index ec0bf2c3c..ea3bb4299 100644 
--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu +++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu @@ -347,7 +347,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { hmma16816_f32( C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx], - reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); + reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); } } } diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 83bbd1e68..a974c881e 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index ba0a2410c..ea96326ed 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index cd1830764..c96d68d9b 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 49eee4128..b26505f77 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
const int SUB = 0x64086408; -- GitLab From 88ad9ec6b23b79c358ce279b02a67e7c96e2c8b9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Apr 2025 22:03:35 +0800 Subject: [PATCH 022/461] [Frontend] Support `chat_template_kwargs` in `LLM.chat` (#17356) Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_chat.py | 109 ++++++++++++++++++++++++----- vllm/entrypoints/llm.py | 21 +++--- 2 files changed, 106 insertions(+), 24 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 6a4862123..742a66683 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,15 +1,31 @@ # SPDX-License-Identifier: Apache-2.0 +import weakref import pytest from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from ..openai.test_vision import TEST_IMAGE_URLS -def test_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") +@pytest.fixture(scope="function") +def text_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + seed=0) + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +def test_chat(text_llm): prompt1 = "Explain the concept of entropy." messages = [ { @@ -21,13 +37,11 @@ def test_chat(): "content": prompt1 }, ] - outputs = llm.chat(messages) + outputs = text_llm.chat(messages) assert len(outputs) == 1 -def test_multi_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") - +def test_multi_chat(text_llm): prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." 
@@ -55,13 +69,14 @@ def test_multi_chat(): messages = [conversation1, conversation2] - outputs = llm.chat(messages) + outputs = text_llm.chat(messages) assert len(outputs) == 2 -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: list[str]): +@pytest.fixture(scope="function") +def vision_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection llm = LLM( model="microsoft/Phi-3.5-vision-instruct", max_model_len=4096, @@ -69,8 +84,20 @@ def test_chat_multi_image(image_urls: list[str]): enforce_eager=True, trust_remote_code=True, limit_mm_per_prompt={"image": 2}, + seed=0, ) + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(vision_llm, image_urls: list[str]): messages = [{ "role": "user", @@ -87,16 +114,15 @@ def test_chat_multi_image(image_urls: list[str]): }, ], }] - outputs = llm.chat(messages) + outputs = vision_llm.chat(messages) assert len(outputs) >= 0 -def test_llm_chat_tokenization_no_double_bos(): +def test_llm_chat_tokenization_no_double_bos(text_llm): """ LLM.chat() should not add special tokens when using chat templates. Check we get a single BOS token for llama chat. """ - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True) messages = [ { "role": "system", @@ -107,13 +133,64 @@ def test_llm_chat_tokenization_no_double_bos(): "content": "Hello!" 
}, ] - outputs = llm.chat(messages) + outputs = text_llm.chat(messages) assert len(outputs) == 1 - prompt_token_ids = getattr(outputs[0], "prompt_token_ids", None) + + prompt_token_ids = outputs[0].prompt_token_ids assert prompt_token_ids is not None - bos_token = llm.get_tokenizer().bos_token_id + bos_token = text_llm.get_tokenizer().bos_token_id # Ensure we have a single BOS assert prompt_token_ids[0] == bos_token assert prompt_token_ids[1] != bos_token, "Double BOS" + + +@pytest.fixture(scope="function") +def thinking_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model="Qwen/Qwen3-0.6B", + max_model_len=4096, + enforce_eager=True, + seed=0, + ) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize("enable_thinking", [True, False]) +def test_chat_extra_kwargs(thinking_llm, enable_thinking): + messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "What is 1+1?" 
+ }, + ] + + outputs = thinking_llm.chat( + messages, + chat_template_kwargs={"enable_thinking": enable_thinking}, + ) + assert len(outputs) == 1 + + prompt_token_ids = outputs[0].prompt_token_ids + assert prompt_token_ids is not None + + think_id = thinking_llm.get_tokenizer().get_vocab()[""] + + if enable_thinking: + assert think_id not in prompt_token_ids + else: + # The chat template includes dummy thinking process + assert think_id in prompt_token_ids diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 653e61a11..948e8f36e 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -656,6 +656,7 @@ class LLM: add_generation_prompt: bool = True, continue_final_message: bool = False, tools: Optional[list[dict[str, Any]]] = None, + chat_template_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, ) -> list[RequestOutput]: """ @@ -696,6 +697,8 @@ class LLM: continue_final_message: If True, continues the final message in the conversation instead of starting a new one. Cannot be ``True`` if ``add_generation_prompt`` is also ``True``. + chat_template_kwargs: Additional kwargs to pass to the chat + template. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. 
@@ -726,6 +729,14 @@ class LLM: trust_remote_code=model_config.trust_remote_code, ) + _chat_template_kwargs: dict[str, Any] = dict( + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tools, + ) + _chat_template_kwargs.update(chat_template_kwargs or {}) + prompts: list[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: @@ -743,20 +754,14 @@ class LLM: prompt_token_ids = apply_mistral_chat_template( tokenizer, messages=msgs, - chat_template=chat_template, - tools=tools, - add_generation_prompt=add_generation_prompt, - continue_final_message=continue_final_message, + **_chat_template_kwargs, ) else: prompt_str = apply_hf_chat_template( tokenizer, trust_remote_code=model_config.trust_remote_code, conversation=conversation, - chat_template=chat_template, - tools=tools, - add_generation_prompt=add_generation_prompt, - continue_final_message=continue_final_message, + **_chat_template_kwargs, ) # Special tokens are already included in chat templates so # should not be added by the tokenizer in this case. -- GitLab From 900edfa8d4081b99ac6891c6b3cc65076ee3f9af Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 17:08:03 +0100 Subject: [PATCH 023/461] Transformers backend tweaks (#17365) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index ad7c07dc8..7b946ad6a 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,6 @@ # limitations under the License. 
"""Wrapper around `transformers` models""" import re -from itertools import chain from typing import Iterable, Literal, Optional, Union import torch @@ -166,12 +165,9 @@ class TransformersModel(nn.Module): # Initialize buffers (e.g. rotary embedding inverse frequency) self.init_buffers(self.model) - # Initialize parameters + # Initialize any parameters that have not had their modules replaced self.init_parameters(self.model) - # Move remaining meta tensors to device (should happen last) - self.meta_to_empty(self.model) - self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) @@ -296,6 +292,14 @@ class TransformersModel(nn.Module): """ for name, buffer in module.named_buffers(recurse=False): if buffer.device == torch.device("meta"): + if module == self.model: + logger.warning( + "To initialize buffers correctly, we instantiate the " + "parent module and and extract the value of the " + "buffer from it. In this case, the parent module is " + "the base model. Instantiating the entire model here " + "risks GPU OOM. 
Could this buffer be moved to a child " + "module?") new_buffer = getattr(type(module)(self.config), name) setattr(module, name, new_buffer) for child in module.children(): @@ -320,14 +324,6 @@ class TransformersModel(nn.Module): for child in module.children(): self.init_parameters(child) - def meta_to_empty(self, module: nn.Module): - tensors = list(chain(module.buffers(), module.parameters())) - if tensors and all(t.device == torch.device("meta") for t in tensors): - module.to_empty(device=self.device_config.device) - return # We can stop recursing because to_empty is recursive - for child in module.children(): - self.meta_to_empty(child) - def get_input_embeddings(self) -> nn.Module: return self.model.get_input_embeddings() -- GitLab From 0ed27ef66ca7d7fec3c9bb8e33709ade1c73b4ad Mon Sep 17 00:00:00 2001 From: a2q1p Date: Wed, 30 Apr 2025 00:23:39 +0800 Subject: [PATCH 024/461] Fix: Spelling of inference (#17387) --- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 7b0032572..5be6b22c7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -384,7 +384,7 @@ class CompressedTensorsConfig(QuantizationConfig): Detect whether a layer_name is found in any target and use the quantization scheme corresponding to the matched target - to select the CompressedTensorsScheme used for infernece. + to select the CompressedTensorsScheme used for inference. 
""" # Find the "target" in the compressed-tensors config -- GitLab From 2ef5d106bbf269563889308039ab10b149b57008 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 17:25:08 +0100 Subject: [PATCH 025/461] Improve literal dataclass field conversion to argparse argument (#17391) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 34 +++++++++++++++++++++++++++++---- tests/test_config.py | 35 +++++++++++++++++++++++++++++++++- vllm/config.py | 19 ++++++++++++++---- vllm/engine/arg_utils.py | 27 +++++++++++++++++--------- 4 files changed, 97 insertions(+), 18 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 052d5793c..2c8665802 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -11,7 +11,8 @@ import pytest from vllm.config import PoolerConfig, config from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, get_type, is_not_builtin, is_type, - nullable_kvs, optional_type) + literal_to_kwargs, nullable_kvs, + optional_type) from vllm.utils import FlexibleArgumentParser @@ -71,6 +72,21 @@ def test_get_type(type_hints, type, expected): assert get_type(type_hints, type) == expected +@pytest.mark.parametrize(("type_hints", "expected"), [ + ({Literal[1, 2]}, { + "type": int, + "choices": [1, 2] + }), + ({Literal[1, "a"]}, Exception), +]) +def test_literal_to_kwargs(type_hints, expected): + context = nullcontext() + if expected is Exception: + context = pytest.raises(expected) + with context: + assert literal_to_kwargs(type_hints) == expected + + @config @dataclass class DummyConfigClass: @@ -81,11 +97,15 @@ class DummyConfigClass: optional_literal: Optional[Literal["x", "y"]] = None """Optional literal with default None""" tuple_n: tuple[int, ...] 
= field(default_factory=lambda: (1, 2, 3)) - """Tuple with default (1, 2, 3)""" + """Tuple with variable length""" tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2)) - """Tuple with default (1, 2)""" + """Tuple with fixed length""" list_n: list[int] = field(default_factory=lambda: [1, 2, 3]) - """List with default [1, 2, 3]""" + """List with variable length""" + list_literal: list[Literal[1, 2]] = field(default_factory=list) + """List with literal choices""" + literal_literal: Literal[Literal[1], Literal[2]] = 1 + """Literal of literals with default 1""" @pytest.mark.parametrize(("type_hint", "expected"), [ @@ -111,6 +131,12 @@ def test_get_kwargs(): # lists should work assert kwargs["list_n"]["type"] is int assert kwargs["list_n"]["nargs"] == "+" + # lists with literals should have the correct choices + assert kwargs["list_literal"]["type"] is int + assert kwargs["list_literal"]["nargs"] == "+" + assert kwargs["list_literal"]["choices"] == [1, 2] + # literals of literals should have merged choices + assert kwargs["literal_literal"]["choices"] == [1, 2] @pytest.mark.parametrize(("arg", "expected"), [ diff --git a/tests/test_config.py b/tests/test_config.py index 53db91e81..2e5da8128 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,14 +1,47 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import MISSING, Field, asdict, dataclass, field +from typing import Literal, Union import pytest -from vllm.config import ModelConfig, PoolerConfig, get_field +from vllm.config import ModelConfig, PoolerConfig, config, get_field from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform +class TestConfig1: + pass + + +@dataclass +class TestConfig2: + a: int + """docstring""" + + +@dataclass +class TestConfig3: + a: int = 1 + + +@dataclass +class TestConfig4: + a: Union[Literal[1], Literal[2]] = 1 + """docstring""" + + +@pytest.mark.parametrize(("test_config", "expected_error"), [ + (TestConfig1, 
"must be a dataclass"), + (TestConfig2, "must have a default"), + (TestConfig3, "must have a docstring"), + (TestConfig4, "must use a single Literal"), +]) +def test_config(test_config, expected_error): + with pytest.raises(Exception, match=expected_error): + config(test_config) + + def test_get_field(): @dataclass diff --git a/vllm/config.py b/vllm/config.py index c1c72846d..8f927835d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ from dataclasses import (MISSING, dataclass, field, fields, is_dataclass, from importlib.util import find_spec from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, - Optional, Protocol, TypeVar, Union, get_args) + Optional, Protocol, TypeVar, Union, get_args, get_origin) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -177,9 +177,19 @@ def config(cls: ConfigT) -> ConfigT: raise ValueError( f"Field '{f.name}' in {cls.__name__} must have a default value." ) + if f.name not in attr_docs: raise ValueError( f"Field '{f.name}' in {cls.__name__} must have a docstring.") + + if get_origin(f.type) is Union: + args = get_args(f.type) + literal_args = [arg for arg in args if get_origin(arg) is Literal] + if len(literal_args) > 1: + raise ValueError( + f"Field '{f.name}' in {cls.__name__} must use a single " + "Literal type. 
Please use 'Literal[Literal1, Literal2]' " + "instead of 'Union[Literal1, Literal2]'.") return cls @@ -3166,6 +3176,8 @@ def get_served_model_name(model: str, GuidedDecodingBackendV0 = Literal["auto", "outlines", "lm-format-enforcer", "xgrammar", "guidance"] GuidedDecodingBackendV1 = Literal["auto", "xgrammar", "guidance"] +GuidedDecodingBackend = Literal[GuidedDecodingBackendV0, + GuidedDecodingBackendV1] @config @@ -3173,9 +3185,8 @@ GuidedDecodingBackendV1 = Literal["auto", "xgrammar", "guidance"] class DecodingConfig: """Dataclass which contains the decoding strategy of the engine.""" - guided_decoding_backend: Union[ - GuidedDecodingBackendV0, - GuidedDecodingBackendV1] = "auto" if envs.VLLM_USE_V1 else "xgrammar" + guided_decoding_backend: GuidedDecodingBackend = \ + "auto" if envs.VLLM_USE_V1 else "xgrammar" """Which engine will be used for guided decoding (JSON schema / regex etc) by default. With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ad2624123..fe688025f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -116,6 +116,18 @@ def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT: return next((th for th in type_hints if is_type(th, type)), None) +def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]: + """Convert Literal type hints to argparse kwargs.""" + type_hint = get_type(type_hints, Literal) + choices = get_args(type_hint) + choice_type = type(choices[0]) + if not all(isinstance(choice, choice_type) for choice in choices): + raise ValueError( + "All choices must be of the same type. 
" + f"Got {choices} with types {[type(c) for c in choices]}") + return {"type": choice_type, "choices": sorted(choices)} + + def is_not_builtin(type_hint: TypeHint) -> bool: """Check if the class is not a built-in type.""" return type_hint.__module__ != "builtins" @@ -151,15 +163,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: # Creates --no- and -- flags kwargs[name]["action"] = argparse.BooleanOptionalAction elif contains_type(type_hints, Literal): - # Creates choices from Literal arguments - type_hint = get_type(type_hints, Literal) - choices = sorted(get_args(type_hint)) - kwargs[name]["choices"] = choices - choice_type = type(choices[0]) - assert all(type(c) is choice_type for c in choices), ( - "All choices must be of the same type. " - f"Got {choices} with types {[type(c) for c in choices]}") - kwargs[name]["type"] = choice_type + kwargs[name].update(literal_to_kwargs(type_hints)) elif contains_type(type_hints, tuple): type_hint = get_type(type_hints, tuple) types = get_args(type_hint) @@ -191,6 +195,11 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: raise ValueError( f"Unsupported type {type_hints} for argument {name}.") + # If the type hint was a sequence of literals, use the helper function + # to update the type and choices + if get_origin(kwargs[name].get("type")) is Literal: + kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) + # If None is in type_hints, make the argument optional. # But not if it's a bool, argparse will handle this better. 
if type(None) in type_hints and not contains_type(type_hints, bool): -- GitLab From 24e6ad3f16d59005cdfc4de6c7bdeb4359b5d21c Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 30 Apr 2025 00:28:41 +0800 Subject: [PATCH 026/461] [V1] Remove num_input_tokens from attn_metadata (#17193) Signed-off-by: Chen Zhang --- vllm/forward_context.py | 16 +++++++--------- vllm/v1/attention/backends/flash_attn.py | 3 --- vllm/v1/attention/backends/flashinfer.py | 3 --- vllm/v1/attention/backends/mla/common.py | 3 --- vllm/v1/worker/gpu_model_runner.py | 5 +++-- vllm/v1/worker/tpu_model_runner.py | 5 ++++- 6 files changed, 14 insertions(+), 21 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 06790d8ee..c75d8f088 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -74,15 +74,13 @@ def set_forward_context(attn_metadata: Any, if vllm_config.parallel_config.data_parallel_size > 1: dp_size = vllm_config.parallel_config.data_parallel_size dp_rank = vllm_config.parallel_config.data_parallel_rank - if attn_metadata is not None: - if hasattr(attn_metadata, "num_prefill_tokens"): - # for v0 attention backends - batchsize = attn_metadata.num_prefill_tokens + \ - attn_metadata.num_decode_tokens - else: - # for v1 attention backends - batchsize = attn_metadata.num_input_tokens + if attn_metadata is not None and hasattr(attn_metadata, + "num_prefill_tokens"): + # for v0 attention backends + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens else: + # for v1 attention backends or no attn_metadata batchsize = num_tokens num_tokens_across_dp = [0] * dp_size num_tokens_across_dp[dp_rank] = batchsize @@ -124,7 +122,7 @@ def set_forward_context(attn_metadata: Any, attn_metadata.num_decode_tokens else: # for v1 attention backends - batchsize = attn_metadata.num_input_tokens + batchsize = num_tokens # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch 
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 41bb9aba2..217dcd7c3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -94,9 +94,6 @@ class FlashAttentionMetadata: scheduler_metadata: Optional[torch.Tensor] = None prefix_scheduler_metadata: Optional[torch.Tensor] = None - # For logging. - num_input_tokens: int = 0 # Number of tokens including padding. - # for local attention @dataclass class LocalAttentionMetadata: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index bce446bd2..6e964b471 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -183,9 +183,6 @@ class FlashInferMetadata: decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None cascade_wrapper: Optional[MultiLevelCascadeAttentionWrapper] = None - # For logging. - num_input_tokens: int = 0 # Number of tokens including padding. - @property def query_start_loc(self): # The GPUModelRunner expects to be able to access this property. diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index b032006d1..fd3be901f 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -312,9 +312,6 @@ class MLACommonMetadata(Generic[D]): num_decode_tokens: int num_prefills: int - # For logging. - num_input_tokens: int = 0 # Number of tokens including padding. 
- # The dimension of the attention heads head_dim: Optional[int] = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e3d8b94fe..4711beadb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1036,7 +1036,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_input_tokens = round_up(num_scheduled_tokens, tp_size) else: num_input_tokens = num_scheduled_tokens - attn_metadata.num_input_tokens = num_input_tokens # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order @@ -1088,7 +1087,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Run the decoder. # Use persistent buffers for CUDA graphs. - with set_forward_context(attn_metadata, self.vllm_config): + with set_forward_context(attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens): output = self.model( input_ids=input_ids, positions=positions, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 67f8af29d..d716542f7 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -769,7 +769,10 @@ class TPUModelRunner: xm.mark_step() num_reqs = self.input_batch.num_reqs # Run the decoder - with set_forward_context(attn_metadata, self.vllm_config): + with set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=scheduler_output.total_num_scheduled_tokens): hidden_states = self.model( input_ids=input_ids, positions=self.position_ids, -- GitLab From a39203f99ed426eef8b55927cb8f8668644d0a86 Mon Sep 17 00:00:00 2001 From: mofanke <54242816+mofanke@users.noreply.github.com> Date: Wed, 30 Apr 2025 00:32:40 +0800 Subject: [PATCH 027/461] =?UTF-8?q?[Bugfix]=20add=20qwen3=20reasoning-pars?= =?UTF-8?q?er=20fix=20content=20is=20None=20when=20disable=20=E2=80=A6=20(?= =?UTF-8?q?#17369)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 
mofanke --- docs/source/features/reasoning_outputs.md | 1 + .../reasoning/test_qwen3_reasoning_parser.py | 141 ++++++++++++++++++ vllm/reasoning/__init__.py | 2 + vllm/reasoning/qwen3_reasoning_parser.py | 138 +++++++++++++++++ 4 files changed, 282 insertions(+) create mode 100644 tests/reasoning/test_qwen3_reasoning_parser.py create mode 100644 vllm/reasoning/qwen3_reasoning_parser.py diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 3a0be69f8..323bf849a 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models: | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | - IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. 
diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py new file mode 100644 index 000000000..95b7460d3 --- /dev/null +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "qwen3" +start_token = "" +end_token = "" + +REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B" + + +@pytest.fixture(scope="module") +def qwen3_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +# 带 ,非stream +WITH_THINK = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", +} +# 带 ,stream +WITH_THINK_STREAM = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", +} +# 不带 ,非stream +WITHOUT_THINK = { + "output": "This is the rest", + "reasoning_content": None, + "content": "This is the rest", +} +# 不带 ,stream +WITHOUT_THINK_STREAM = { + "output": "This is the rest", + "reasoning_content": None, + "content": "This is the rest", +} + +COMPLETE_REASONING = { + "output": "This is a reasoning section", + "reasoning_content": "This is a reasoning section", + "content": None, +} +MULTILINE_REASONING = { + "output": + "This is a reasoning\nsectionThis is the rest\nThat", + "reasoning_content": "This is a reasoning\nsection", + "content": "This is the rest\nThat", +} +ONLY_OPEN_TAG = { + "output": "This is a reasoning section", + "reasoning_content": None, + "content": "This is a reasoning section", +} + +ONLY_OPEN_TAG_STREAM = { + "output": "This is a reasoning section", + "reasoning_content": "This is a reasoning section", + "content": None, +} + +TEST_CASES = [ + pytest.param( + False, + 
WITH_THINK, + id="with_think", + ), + pytest.param( + True, + WITH_THINK_STREAM, + id="with_think_stream", + ), + pytest.param( + False, + WITHOUT_THINK, + id="without_think", + ), + pytest.param( + True, + WITHOUT_THINK_STREAM, + id="without_think_stream", + ), + pytest.param( + False, + COMPLETE_REASONING, + id="complete_reasoning", + ), + pytest.param( + True, + COMPLETE_REASONING, + id="complete_reasoning_stream", + ), + pytest.param( + False, + MULTILINE_REASONING, + id="multiline_reasoning", + ), + pytest.param( + True, + MULTILINE_REASONING, + id="multiline_reasoning_stream", + ), + pytest.param( + False, + ONLY_OPEN_TAG, + id="only_open_tag", + ), + pytest.param( + True, + ONLY_OPEN_TAG_STREAM, + id="only_open_tag_stream", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + qwen3_tokenizer, +): + output = qwen3_tokenizer.tokenize(param_dict["output"]) + output_tokens: list[str] = [ + qwen3_tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( + parser_name)(qwen3_tokenizer) + + reasoning, content = run_reasoning_extraction(parser, + output_tokens, + streaming=streaming) + + assert reasoning == param_dict["reasoning_content"] + assert content == param_dict["content"] diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 45132a780..65606ce55 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -3,10 +3,12 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser from .granite_reasoning_parser import GraniteReasoningParser +from .qwen3_reasoning_parser import Qwen3ReasoningParser __all__ = [ "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser", "GraniteReasoningParser", + "Qwen3ReasoningParser", ] diff --git 
a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py new file mode 100644 index 000000000..78a73011f --- /dev/null +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from collections.abc import Sequence +from typing import Optional, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("qwen3") +class Qwen3ReasoningParser(ReasoningParser): + """ + Reasoning parser for the Qwen3 model. + + The Qwen3 model uses ... tokens to denote reasoning text + within its output. The model provides a strict switch to disable reasoning + output via the 'enable_thinking=False' parameter. This parser extracts the + reasoning content enclosed by and tokens from the model's + output. 
+ """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + self.think_start_token = "" + self.think_end_token = "" + + self.reasoning_regex = re.compile( + rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " + "constructor during construction.") + + self.think_start_token_id = self.vocab.get(self.think_start_token) + self.think_end_token_id = self.vocab.get(self.think_end_token) + if (self.think_start_token_id is None + or self.think_end_token_id is None): + raise RuntimeError( + "Qwen3 reasoning parser could not locate think start/end " + "tokens in the tokenizer!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Extract reasoning content from a delta message. + Handles streaming output where previous + delta = current. + Uses token IDs for faster processing. 
+ For text abcxyz: + - 'abc' goes to reasoning_content + - 'xyz' goes to content + """ + # Skip single special tokens + if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ + self.think_start_token_id, self.think_end_token_id + ]): + return None + + if self.think_start_token_id in previous_token_ids: + if self.think_end_token_id in delta_token_ids: + # in previous, in delta, + # extract reasoning content + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.think_end_token):] + return DeltaMessage(reasoning_content=reasoning_content, + content=content if content else None) + elif self.think_end_token_id in previous_token_ids: + # in previous, in previous, + # reasoning content continues + return DeltaMessage(content=delta_text) + else: + # in previous, no in previous or delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + elif self.think_start_token_id in delta_token_ids: + logger.info(delta_text) + if self.think_end_token_id in delta_token_ids: + # in delta, in delta, extract reasoning content + start_index = delta_text.find(self.think_start_token) + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[start_index + + len(self.think_start_token + ):end_index] + content = delta_text[end_index + len(self.think_end_token):] + return DeltaMessage(reasoning_content=reasoning_content, + content=content if content else None) + else: + # in delta, no in delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + else: + # thinking is disabled, just content + return DeltaMessage(content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + + # Check if the model output contains the tokens. 
+ if (self.think_start_token not in model_output + or self.think_end_token not in model_output): + return None, model_output + else: + # Use a regex to find the reasoning content + reasoning_content = self.reasoning_regex.findall(model_output)[0] + + # Remove the reasoning content from the model output + # Although token is always at the + # beginning of the line, we cannot guarantee that the + # other models will follow this convention. + # Therefore, we need to add :start_index. + start_index = model_output.find(self.think_start_token) + if start_index != -1: + end_index = start_index + len( + f"{self.think_start_token}{reasoning_content}{self.think_end_token}" + ) + model_output = model_output[:start_index] + \ + model_output[end_index:] + + if len(model_output) == 0: + return reasoning_content, None + + return reasoning_content, model_output -- GitLab From d3cf61b89bc53aa7709932ab43e7630b9a71f2b3 Mon Sep 17 00:00:00 2001 From: Qiming Zhang Date: Tue, 29 Apr 2025 09:40:25 -0700 Subject: [PATCH 028/461] fix gemma3 results all zero (#17364) Signed-off-by: mayuyuace --- vllm/model_executor/layers/layernorm.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 75a5317b1..87d9b959e 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -241,7 +241,10 @@ class GemmaRMSNorm(CustomOp): """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype if residual is not None: - x = x + residual + if orig_dtype == torch.float16: + x = x + residual.float() + else: + x = x + residual residual = x x = x.float() -- GitLab From 06ffc7e1d35b3f754e46439babfed564822bbb75 Mon Sep 17 00:00:00 2001 From: TY-AMD Date: Wed, 30 Apr 2025 01:26:42 +0800 Subject: [PATCH 029/461] [Misc][ROCm] Exclude `cutlass_mla_decode` for ROCm build (#17289) Signed-off-by: Tianyuan Wu --- csrc/torch_bindings.cpp | 14 +++++++------- 1 
file changed, 7 insertions(+), 7 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c9a120976..b595b0aa6 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,13 +130,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ") -> ()"); ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer); - // Compute MLA decode using cutlass. - ops.def( - "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," - " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," - " Tensor page_table, float scale) -> ()"); - ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); - // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( @@ -450,6 +443,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]"); ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress); + // CUTLASS MLA decode + ops.def( + "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," + " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," + " Tensor page_table, float scale) -> ()"); + ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); + // Mamba selective scan kernel ops.def( "selective_scan_fwd(Tensor! u, Tensor! delta," -- GitLab From 608968b7c5709dc74754cb09c72a5793dec7af2b Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 29 Apr 2025 12:27:27 -0500 Subject: [PATCH 030/461] Enabling multi-group kernel tests. (#17115) Signed-off-by: Alexei V. 
Ivanov --- .../scripts/hardware_ci/run-amd-test.sh | 74 +++++++++++-------- .buildkite/test-pipeline.yaml | 5 ++ 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 368f30434..d29903bf4 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -75,37 +75,51 @@ HF_MOUNT="/root/.cache/huggingface" commands=$@ echo "Commands:$commands" #ignore certain kernels tests -if [[ $commands == *" kernels "* ]]; then +if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ - --ignore=kernels/test_attention_selector.py \ - --ignore=kernels/test_blocksparse_attention.py \ - --ignore=kernels/test_causal_conv1d.py \ - --ignore=kernels/test_cutlass.py \ - --ignore=kernels/test_encoder_decoder_attn.py \ - --ignore=kernels/test_flash_attn.py \ - --ignore=kernels/test_flashinfer.py \ - --ignore=kernels/test_int8_quant.py \ - --ignore=kernels/test_machete_gemm.py \ - --ignore=kernels/test_mamba_ssm.py \ - --ignore=kernels/test_marlin_gemm.py \ - --ignore=kernels/test_moe.py \ - --ignore=kernels/test_prefix_prefill.py \ - --ignore=kernels/test_rand.py \ - --ignore=kernels/test_sampler.py \ - --ignore=kernels/test_cascade_flash_attn.py \ - --ignore=kernels/test_mamba_mixer2.py \ - --ignore=kernels/test_aqlm.py \ - --ignore=kernels/test_machete_mm.py \ - --ignore=kernels/test_mha_attn.py \ - --ignore=kernels/test_block_fp8.py \ - --ignore=kernels/test_cutlass_moe.py \ - --ignore=kernels/test_mamba_ssm_ssd.py \ - --ignore=kernels/test_attention.py \ - --ignore=kernels/test_block_int8.py \ - --ignore=kernels/test_fused_quant_layernorm.py \ - --ignore=kernels/test_int8_kernel.py \ - --ignore=kernels/test_triton_moe_ptpc_fp8.py \ - --ignore=kernels/test_permute_cols.py" + --ignore=kernels/core/test_fused_quant_layernorm.py \ + --ignore=kernels/core/test_permute_cols.py" +fi + +if [[ $commands == *" 
kernels/attention"* ]]; then + commands="${commands} \ + --ignore=kernels/attention/stest_attention_selector.py \ + --ignore=kernels/attention/test_blocksparse_attention.py \ + --ignore=kernels/attention/test_encoder_decoder_attn.py \ + --ignore=kernels/attention/test_attention_selector.py \ + --ignore=kernels/attention/test_flash_attn.py \ + --ignore=kernels/attention/test_flashinfer.py \ + --ignore=kernels/attention/test_prefix_prefill.py \ + --ignore=kernels/attention/test_cascade_flash_attn.py \ + --ignore=kernels/attention/test_mha_attn.py \ + --ignore=kernels/attention/test_lightning_attn.py \ + --ignore=kernels/attention/test_attention.py" +fi + +if [[ $commands == *" kernels/quantization"* ]]; then + commands="${commands} \ + --ignore=kernels/quantization/test_int8_quant.py \ + --ignore=kernels/quantization/test_aqlm.py \ + --ignore=kernels/quantization/test_machete_mm.py \ + --ignore=kernels/quantization/test_block_fp8.py \ + --ignore=kernels/quantization/test_block_int8.py \ + --ignore=kernels/quantization/test_marlin_gemm.py \ + --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ + --ignore=kernels/quantization/test_int8_kernel.py" +fi + +if [[ $commands == *" kernels/mamba"* ]]; then + commands="${commands} \ + --ignore=kernels/mamba/test_mamba_mixer2.py \ + --ignore=kernels/mamba/test_causal_conv1d.py \ + --ignore=kernels/mamba/test_mamba_ssm_ssd.py" +fi + +if [[ $commands == *" kernels/moe"* ]]; then + commands="${commands} \ + --ignore=kernels/moe/test_moe.py \ + --ignore=kernels/moe/test_cutlass_moe.py \ + --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" fi #ignore certain Entrypoints/openai tests diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 20d858cb1..fc0eb3d9f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -319,6 +319,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Core Operation Test + mirror_hardwares: [amd] source_file_dependencies: - csrc/ - 
tests/kernels/core @@ -326,6 +327,7 @@ steps: - pytest -v -s kernels/core - label: Kernels Attention Test %N + mirror_hardwares: [amd] source_file_dependencies: - csrc/attention/ - vllm/attention @@ -336,6 +338,7 @@ steps: parallelism: 2 - label: Kernels Quantization Test %N + mirror_hardwares: [amd] source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization @@ -345,6 +348,7 @@ steps: parallelism: 2 - label: Kernels MoE Test + #mirror_hardwares: [amd] source_file_dependencies: - csrc/moe/ - tests/kernels/moe @@ -353,6 +357,7 @@ steps: - pytest -v -s kernels/moe - label: Kernels Mamba Test + #mirror_hardwares: [amd] source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba -- GitLab From 56d64fbe3026b5ccef9cf9c7f069cd0e892df155 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 29 Apr 2025 13:29:44 -0400 Subject: [PATCH 031/461] [Docs] Propose a deprecation policy for the project (#17063) Signed-off-by: Russell Bryant Co-authored-by: Cyrus Leung --- .../source/contributing/deprecation_policy.md | 87 +++++++++++++++++++ docs/source/design/v1/metrics.md | 3 + docs/source/index.md | 1 + 3 files changed, 91 insertions(+) create mode 100644 docs/source/contributing/deprecation_policy.md diff --git a/docs/source/contributing/deprecation_policy.md b/docs/source/contributing/deprecation_policy.md new file mode 100644 index 000000000..598f1612d --- /dev/null +++ b/docs/source/contributing/deprecation_policy.md @@ -0,0 +1,87 @@ +# Deprecation Policy + +This document outlines the official policy and process for deprecating features +in the vLLM project. + +## Overview + +vLLM uses a structured "deprecation pipeline" to guide the lifecycle of +deprecated features. This policy ensures that users are given clear and +sufficient notice when a feature is deprecated and that deprecations proceed in +a consistent and predictable manner. 
+ +We aim to strike a balance between continued innovation and respecting users’ +reliance on existing functionality. Deprecations are tied to our **minor (Y) +releases** following semantic versioning (X.Y.Z), where: + +- **X** is a major version (rare) +- **Y** is a minor version (used for significant changes, including deprecations/removals) +- **Z** is a patch version (used for fixes and safer enhancements) + +Features that fall under this policy include (at a minimum) the following: + +- CLI flags +- Environment variables +- Configuration files +- APIs in the OpenAI-compatible API server +- Public Python APIs for the `vllm` library + +## Deprecation Pipeline + +The deprecation process consists of several clearly defined stages that span +multiple Y releases: + +**1. Deprecated (Still On By Default)** + +- **Action**: Feature is marked as deprecated. +- **Timeline**: A removal version is explicitly stated in the deprecation +warning (e.g., "This will be removed in v0.10.0"). +- **Communication**: Deprecation is noted in the following, as applicable: + - Help strings + - Log output + - API responses + - `/metrics` output (for metrics features) + - User-facing documentation + - Release notes + - GitHub Issue (RFC) for feedback + - Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs + +**2. Deprecated (Off By Default)** + +- **Action**: Feature is disabled by default, but can still be re-enabled via a +CLI flag or environment variable. Feature throws an error when used without +re-enabling. +- **Purpose**: Allows users who missed earlier warnings a temporary escape hatch +while signaling imminent removal. Ensures any remaining usage is clearly +surfaced and blocks silent breakage before full removal. + +**3. Removed** + +- **Action**: Feature is completely removed from the codebase. +- **Note**: Only features that have passed through the previous deprecation +stages will be removed. 
+ +## Example Timeline + +Assume a feature is deprecated in `v0.9.0`. + +| Release | Status | +|---------------|-------------------------------------------------------------------------------------------------| +| `v0.9.0` | Feature is deprecated with clear removal version listed. | +| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. | +| `v0.11.0` | Feature is removed. | + +## Important Guidelines + +- **No Removals in Patch Releases**: Removing deprecated features in patch +(`.Z`) releases is disallowed to avoid surprising users. +- **Grace Period for Existing Deprecations**: Any feature deprecated **before +this policy** will have its grace period start **now**, not retroactively. +- **Documentation is Critical**: Ensure every stage of the pipeline is +documented clearly for users. + +## Final Notes + +This policy is a living document and may evolve as the needs of the project and +its users change. Community feedback is welcome and encouraged as we refine the +process. diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md index 3f9629079..7e7c8b925 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/source/design/v1/metrics.md @@ -467,6 +467,9 @@ In general: hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. +See the [deprecation policy](project:../../contributing/deprecation_policy.md) for +the project-wide deprecation policy. + ### Unimplemented - `vllm:tokens_total` Added by , but apparently never implemented. 
This can just be diff --git a/docs/source/index.md b/docs/source/index.md index 43b330e4b..56ff7a485 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -181,6 +181,7 @@ design/v1/metrics :maxdepth: 2 contributing/overview +contributing/deprecation_policy contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index -- GitLab From 0c1c7883122d302d7a8b679a5c99c497c1464553 Mon Sep 17 00:00:00 2001 From: casinca <47400729+casinca@users.noreply.github.com> Date: Tue, 29 Apr 2025 19:29:48 +0200 Subject: [PATCH 032/461] [Doc][Typo] Fixing label in new model requests link in overview.md (#17400) --- docs/source/contributing/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 31c7059fd..3dceec1e5 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -17,7 +17,7 @@ Unsure on where to start? Check out the following links for tasks to work on: - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) - [Selected onboarding tasks](gh-project:6) -- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22) +- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22) - [Models with multi-modal capabilities](gh-project:10) ## License -- GitLab From 792595b59d01eb7060b6cbc6dec20c129ea917b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 29 Apr 2025 19:36:48 +0200 Subject: [PATCH 033/461] [TPU][V1][CI] Replace `python3 setup.py develop` with standard `pip install --e` on TPU (#17374) Signed-off-by: NickLucche --- docker/Dockerfile.tpu | 2 +- .../getting_started/installation/ai_accelerator/tpu.inc.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff 
--git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index 50806d882..295270d29 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ -r requirements/tpu.txt -RUN python3 setup.py develop +RUN python3 -m pip install -e . # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 8beb92ef7..4459cc61e 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -158,7 +158,7 @@ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev Run the setup script: ```bash -VLLM_TARGET_DEVICE="tpu" python setup.py develop +VLLM_TARGET_DEVICE="tpu" python -m pip install -e . 
``` ## Set up using Docker -- GitLab From b37685afbb8fa8ac7530ae15db6a96e23a83281d Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 29 Apr 2025 13:39:16 -0400 Subject: [PATCH 034/461] [CI] Uses Python 3.11 for TPU (#17359) Signed-off-by: Aaron Pham --- requirements/tpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index b63993ba1..16c0ad3ec 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -9,6 +9,7 @@ wheel jinja2>=3.1.6 ray[default] ray[data] +setuptools==78.1.0 # Install torch_xla --pre -- GitLab From 08e15defa9e1b48a34d33e88dffe99e69cae1ab0 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 30 Apr 2025 01:40:52 +0800 Subject: [PATCH 035/461] [CI/Build] Add retry mechanism for add-apt-repository (#17107) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docker/Dockerfile | 10 ++++++++-- docker/Dockerfile.nightly_torch | 10 ++++++++-- docker/Dockerfile.rocm_base | 5 ++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1b28845d0..7d1fac9db 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,7 +19,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl sudo \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ @@ -188,7 +191,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && apt-get update -y \ 
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 0063712e4..6989106c4 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -16,7 +16,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl sudo \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ @@ -197,7 +200,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y 
python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 1776b26d4..12009b8aa 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -32,7 +32,10 @@ ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN apt-get update -y \ && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ python${PYTHON_VERSION}-lib2to3 python-is-python3 \ -- GitLab From 2fa2a50bf950797cb59d48908d205c655ec02654 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 30 Apr 2025 02:21:42 +0800 Subject: [PATCH 036/461] [Bugfix] Fix Minicpm-O-int4 GPTQ model inference (#17397) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/minicpmo.py | 36 +++++++++++++++++++++++++- vllm/model_executor/models/minicpmv.py | 2 +- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index a2ca92cde..f42d48e91 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -28,12 +28,16 @@ from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, import torch from torch import nn -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.whisper.modeling_whisper import ( ACT2FN, WHISPER_ATTENTION_CLASSES, 
WhisperConfig, WhisperEncoder) from vllm.config import VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, NestedTensors) @@ -512,6 +516,36 @@ class MiniCPMO(MiniCPMV2_6): self.audio_token_id = None + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + # See: https://huggingface.co/openbmb/MiniCPM-o-2_6-int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + # MiniCPMO GPTQ model leave vpm unquantized. + quant_config = self._maybe_ignore_quant_config(quant_config) + return super().init_vision_module(config, quant_config, prefix) + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + # MiniCPMO GPTQ model leave resampler unquantized. 
+ quant_config = self._maybe_ignore_quant_config(quant_config) + return super().init_resampler(embed_dim, vision_dim, quant_config, + prefix) + def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): # Do not use parameters temporarily audio_config = self.config.audio_config diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 65a26eadd..300360f78 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1181,7 +1181,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): def init_vision_module( self, config: PretrainedConfig, - quant_config: Optional[QuantizationConfig], + quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> nn.Module: model = Idefics2VisionTransformer(config.vision_config, -- GitLab From a6977dbd1531378456725e5cdb151c88a33df52a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 20:02:23 +0100 Subject: [PATCH 037/461] Simplify (and fix) passing of guided decoding backend options (#17008) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- ...enai_chat_completion_structured_outputs.py | 7 +- tests/entrypoints/llm/test_guided_generate.py | 206 +++++++++++------- .../model_executor/test_guided_processors.py | 15 +- .../llm/test_struct_output_generate.py | 31 +-- tests/v1/test_oracle.py | 3 +- vllm/config.py | 70 +++++- vllm/engine/arg_utils.py | 36 ++- vllm/engine/llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 4 +- .../guided_decoding/__init__.py | 24 +- .../guided_decoding/guidance_decoding.py | 5 +- .../guided_decoding/xgrammar_decoding.py | 12 +- vllm/sampling_params.py | 56 +++-- vllm/v1/engine/processor.py | 8 +- vllm/v1/structured_output/__init__.py | 8 +- vllm/v1/structured_output/backend_guidance.py | 25 +-- vllm/v1/structured_output/backend_xgrammar.py | 14 +- 17 files changed, 309 insertions(+), 217 deletions(-) diff 
--git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index f71162e36..9c57af1c1 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): "alan.turing@enigma.com\n") try: - # The no-fallback option forces vLLM to use xgrammar, so when it fails - # you get a 400 with the reason why + # The guided_decoding_disable_fallback option forces vLLM to use + # xgrammar, so when it fails you get a 400 with the reason why completion = client.chat.completions.create( model=model, messages=[{ @@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): extra_body={ "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], - "guided_decoding_backend": "xgrammar:no-fallback" + "guided_decoding_backend": "xgrammar", + "guided_decoding_disable_fallback": True, }, ) return completion.choices[0].message.content diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index ad726fa8c..fdbdccd46 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -16,10 +16,11 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = [ - "outlines", - "lm-format-enforcer", - "xgrammar:disable-any-whitespace", - "guidance:disable-any-whitespace", + # (backend, disable_any_whitespace), + ("outlines", False), + ("lm-format-enforcer", False), + ("xgrammar", True), + ("guidance", True), ] @@ -36,13 +37,17 @@ def llm(): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_regex(sample_regex, llm, guided_decoding_backend: str): - sampling_params = 
SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - regex=sample_regex, - backend=guided_decoding_backend)) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_regex(sample_regex, llm, guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + guided_decoding=GuidedDecodingParams( + regex=sample_regex, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example IPv4 address with this regex: {sample_regex}" ] * 2, @@ -62,14 +67,18 @@ def test_guided_regex(sample_regex, llm, guided_decoding_backend: str): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_json_completion(sample_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for an employee profile " f"that fits this schema: {sample_json_schema}" @@ -92,14 +101,18 @@ def test_guided_json_completion(sample_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def 
test_guided_complex_json_completion(sample_complex_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_complex_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_complex_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for an assignment grade " f"that fits this schema: {sample_complex_json_schema}" @@ -123,14 +136,18 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_definition_json_completion(sample_definition_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_definition_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_definition_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for solving 8x + 7 = -23 " f"that fits this schema: {sample_definition_json_schema}" @@ -154,14 +171,18 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) 
+@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_enum_json_completion(sample_enum_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_enum_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_enum_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ "Create a bug report JSON that fits this schema: " f"{sample_enum_json_schema}. Make it for a high priority critical bug." @@ -195,14 +216,18 @@ def test_guided_enum_json_completion(sample_enum_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_choice_completion(sample_guided_choice, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - choice=sample_guided_choice, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + guided_decoding=GuidedDecodingParams( + choice=sample_guided_choice, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts="The best language for type-safe systems programming is ", sampling_params=sampling_params, @@ -221,15 +246,19 @@ def test_guided_choice_completion(sample_guided_choice, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", 
GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_grammar(sample_sql_statements, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - grammar=sample_sql_statements, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + grammar=sample_sql_statements, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts=("Generate a sql state that select col_1 from " "table_1 where it is equals to 1"), @@ -300,7 +329,8 @@ def test_disable_guided_decoding_fallback(sample_regex, llm): top_p=0.95, guided_decoding=GuidedDecodingParams( json=unsupported_json, - backend="xgrammar:no-fallback")) + backend="xgrammar", + disable_fallback=True)) with pytest.raises( ValueError, @@ -312,14 +342,18 @@ def test_disable_guided_decoding_fallback(sample_regex, llm): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_json_object(llm, guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=100, - n=2, - guided_decoding=GuidedDecodingParams( - json_object=True, - backend=guided_decoding_backend)) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_json_object(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=100, + n=2, + guided_decoding=GuidedDecodingParams( + json_object=True, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( 
prompts=("Generate a JSON object with curly braces for a person with " @@ -337,7 +371,7 @@ def test_guided_json_object(llm, guided_decoding_backend: str): print(generated_text) assert generated_text is not None - if 'disable-any-whitespace' in guided_decoding_backend: + if disable_any_whitespace: assert "\n" not in generated_text # Parse to verify it is valid JSON @@ -359,14 +393,18 @@ class CarDescription(BaseModel): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): json_schema = CarDescription.model_json_schema() - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=json_schema, - backend=guided_decoding_backend)) + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts="Generate a JSON with the brand, model and car_type of" "the most iconic car from the 90's", @@ -387,9 +425,10 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_number_range_json_completion(llm, - guided_decoding_backend: str): +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): sample_output_schema = { "type": "object", "properties": { @@ -413,8 +452,10 @@ def 
test_guided_number_range_json_completion(llm, sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=sample_output_schema, - backend=guided_decoding_backend), + guided_decoding=GuidedDecodingParams( + json=sample_output_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace), ) outputs = llm.generate( prompts=[ @@ -466,8 +507,12 @@ def test_guidance_no_additional_properties(llm): "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" "<|im_end|>\n<|im_start|>assistant\n") - def generate_with_backend(backend): - guided_params = GuidedDecodingParams(json=schema, backend=backend) + def generate_with_backend(backend, disable_additional_properties): + guided_params = GuidedDecodingParams( + json=schema, + backend=backend, + disable_any_whitespace=True, + disable_additional_properties=disable_additional_properties) sampling_params = SamplingParams(temperature=0, max_tokens=256, guided_decoding=guided_params) @@ -481,7 +526,7 @@ def test_guidance_no_additional_properties(llm): jsonschema.validate(instance=parsed_json, schema=schema) return parsed_json - base_generated = generate_with_backend('guidance:disable-any-whitespace') + base_generated = generate_with_backend("guidance", False) assert "a1" in base_generated assert "a2" in base_generated assert "a3" in base_generated @@ -490,8 +535,7 @@ def test_guidance_no_additional_properties(llm): assert "a5" in base_generated assert "a6" in base_generated - generated = generate_with_backend( - 'guidance:no-additional-properties,disable-any-whitespace') + generated = generate_with_backend("guidance", True) assert "a1" in generated assert "a2" in generated assert "a3" in generated diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 59da575e3..6cd966f84 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py 
@@ -202,12 +202,15 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): def test_guided_decoding_backend_options(): """Test backend-specific options""" - params = GuidedDecodingParams( - backend="xgrammar:option-1,option-2,option-3") - assert params.backend_options() == ["option-1", "option-2", "option-3"] - - no_fallback = GuidedDecodingParams(backend="xgrammar:option-1,no-fallback") - assert no_fallback.no_fallback() + with pytest.warns(DeprecationWarning): + guided_decoding_params = GuidedDecodingParams( + backend= + "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties" + ) + assert guided_decoding_params.backend == "xgrammar" + assert guided_decoding_params.disable_fallback + assert guided_decoding_params.disable_any_whitespace + assert guided_decoding_params.disable_additional_properties def test_pickle_xgrammar_tokenizer_data(): diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 3de4fec9c..29ec6088e 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -17,15 +17,12 @@ from vllm.platforms import current_platform from vllm.sampling_params import GuidedDecodingParams, SamplingParams PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ - ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", - "auto"), - ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace", - "auto"), - ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", - "mistral"), - ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral"), + ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto"), #FIXME: This test is flaky on CI thus disabled - 
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"), + #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), ] PARAMS_MODELS_TOKENIZER_MODE = [ @@ -73,6 +70,7 @@ def test_structured_output( enforce_eager=enforce_eager, max_model_len=1024, guided_decoding_backend=guided_decoding_backend, + guided_decoding_disable_any_whitespace=True, tokenizer_mode=tokenizer_mode) # @@ -98,8 +96,7 @@ def test_structured_output( generated_text = output.outputs[0].text assert generated_text is not None - if 'disable-any-whitespace' in guided_decoding_backend: - assert "\n" not in generated_text + assert "\n" not in generated_text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) @@ -520,10 +517,11 @@ def test_structured_output_auto_mode( def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("VLLM_USE_V1", "1") - backend = 'guidance:no-additional-properties,disable-any-whitespace' llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, - guided_decoding_backend=backend) + guided_decoding_backend="guidance", + guided_decoding_disable_any_whitespace=True, + guided_decoding_disable_additional_properties=True) schema = { 'type': 'object', @@ -548,7 +546,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): "<|im_end|>\n<|im_start|>assistant\n") def generate_with_backend(backend): - guided_params = GuidedDecodingParams(json=schema, backend=backend) + guided_params = GuidedDecodingParams( + json=schema, + backend=backend, + disable_any_whitespace=True, + disable_additional_properties=True) sampling_params = SamplingParams(temperature=0, max_tokens=256, guided_decoding=guided_params) @@ -562,8 +564,7 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): jsonschema.validate(instance=parsed_json, schema=schema) return parsed_json - generated = 
generate_with_backend( - 'guidance:no-additional-properties,disable-any-whitespace') + generated = generate_with_backend("guidance") assert "a1" in generated assert "a2" in generated assert "a3" in generated diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 1448641f6..94c8ad7c9 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -57,7 +57,8 @@ def test_unsupported_configs(monkeypatch): with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, - guided_decoding_backend="lm-format-enforcer:no-fallback", + guided_decoding_backend="lm-format-enforcer", + guided_decoding_disable_fallback=True, ).create_engine_config() with pytest.raises(NotImplementedError): diff --git a/vllm/config.py b/vllm/config.py index 8f927835d..abe59734e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,12 +17,14 @@ from dataclasses import (MISSING, dataclass, field, fields, is_dataclass, from importlib.util import find_spec from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, - Optional, Protocol, TypeVar, Union, get_args, get_origin) + Optional, Protocol, TypeVar, Union, cast, get_args, + get_origin) import torch from pydantic import BaseModel, Field, PrivateAttr from torch.distributed import ProcessGroup, ReduceOp from transformers import PretrainedConfig +from typing_extensions import deprecated import vllm.envs as envs from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass @@ -32,7 +34,6 @@ from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) from vllm.model_executor.models import ModelRegistry from vllm.platforms import CpuArchEnum, current_platform -from vllm.sampling_params import GuidedDecodingParams from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -344,7 +345,7 @@ class ModelConfig: def 
__init__( self, model: str, - task: Union[TaskOption, Literal["draft"]], + task: Literal[TaskOption, Literal["draft"]], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -701,7 +702,7 @@ class ModelConfig: def _resolve_task( self, - task_option: Union[TaskOption, Literal["draft"]], + task_option: Literal[TaskOption, Literal["draft"]], ) -> tuple[set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" @@ -3185,13 +3186,36 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0, class DecodingConfig: """Dataclass which contains the decoding strategy of the engine.""" - guided_decoding_backend: GuidedDecodingBackend = \ - "auto" if envs.VLLM_USE_V1 else "xgrammar" + @property + @deprecated( + "`guided_decoding_backend` is deprecated and has been renamed to " + "`backend`. This will be removed in v0.10.0. Please use the " + "`backend` argument instead.") + def guided_decoding_backend(self) -> GuidedDecodingBackend: + return self.backend + + @guided_decoding_backend.setter + def guided_decoding_backend(self, value: GuidedDecodingBackend): + self.backend = value + + backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar" """Which engine will be used for guided decoding (JSON schema / regex etc) by default. With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior is subject to change in each release.""" + disable_fallback: bool = False + """If `True`, vLLM will not fallback to a different backend on error.""" + + disable_any_whitespace: bool = False + """If `True`, the model will not generate any whitespace during guided + decoding. This is only supported for xgrammar and guidance backends.""" + + disable_additional_properties: bool = False + """If `True`, the `guidance` backend will not use `additionalProperties` + in the JSON schema. 
This is only supported for the `guidance` backend and + is used to better align its behaviour with `outlines` and `xgrammar`.""" + reasoning_backend: Optional[str] = None """Select the reasoning parser depending on the model that you're using. This is used to parse the reasoning content into OpenAI API format. @@ -3217,15 +3241,41 @@ class DecodingConfig: return hash_str def __post_init__(self): - backend = GuidedDecodingParams( - backend=self.guided_decoding_backend).backend_name + if ":" in self.backend: + self._extract_backend_options() + if envs.VLLM_USE_V1: valid_guided_backends = get_args(GuidedDecodingBackendV1) else: valid_guided_backends = get_args(GuidedDecodingBackendV0) - if backend not in valid_guided_backends: - raise ValueError(f"Invalid guided_decoding_backend '{backend}'," + if self.backend not in valid_guided_backends: + raise ValueError(f"Invalid backend '{self.backend}'," f" must be one of {valid_guided_backends}") + if (self.disable_any_whitespace + and self.backend not in ("xgrammar", "guidance")): + raise ValueError("disable_any_whitespace is only supported for " + "xgrammar and guidance backends.") + if (self.disable_additional_properties and self.backend != "guidance"): + raise ValueError("disable_additional_properties is only supported " + "for the guidance backend.") + + @deprecated( + "Passing guided decoding backend options inside backend in the format " + "'backend:...' is deprecated. This will be removed in v0.10.0. 
Please " + "use the dedicated arguments '--disable-fallback', " + "'--disable-any-whitespace' and '--disable-additional-properties' " + "instead.") + def _extract_backend_options(self): + """Extract backend options from the backend string.""" + backend, options = self.backend.split(":") + self.backend = cast(GuidedDecodingBackend, backend) + options_set = set(options.strip().split(",")) + if "no-fallback" in options_set: + self.disable_fallback = True + if "disable-any-whitespace" in options_set: + self.disable_any_whitespace = True + if "no-additional-properties" in options_set: + self.disable_additional_properties = True @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fe688025f..be0cd4d3a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,9 +18,9 @@ from vllm import version from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, DecodingConfig, Device, DeviceConfig, DistributedExecutorBackend, - GuidedDecodingBackendV1, HfOverrides, - KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ModelImpl, MultiModalConfig, + GuidedDecodingBackend, GuidedDecodingBackendV1, + HfOverrides, KVTransferConfig, LoadConfig, LoadFormat, + LoRAConfig, ModelConfig, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, PromptAdapterConfig, SchedulerConfig, SchedulerPolicy, SpeculativeConfig, @@ -317,7 +317,12 @@ class EngineArgs: bool] = SchedulerConfig.enable_chunked_prefill disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input - guided_decoding_backend: str = DecodingConfig.guided_decoding_backend + guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend + guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback + guided_decoding_disable_any_whitespace: bool = \ + DecodingConfig.disable_any_whitespace + guided_decoding_disable_additional_properties: bool = \ 
+ DecodingConfig.disable_additional_properties logits_processor_pattern: Optional[str] = None speculative_config: Optional[Dict[str, Any]] = None @@ -498,9 +503,17 @@ class EngineArgs: title="DecodingConfig", description=DecodingConfig.__doc__, ) + guided_decoding_group.add_argument("--guided-decoding-backend", + **guided_decoding_kwargs["backend"]) guided_decoding_group.add_argument( - '--guided-decoding-backend', - **guided_decoding_kwargs["guided_decoding_backend"]) + "--guided-decoding-disable-fallback", + **guided_decoding_kwargs["disable_fallback"]) + guided_decoding_group.add_argument( + "--guided-decoding-disable-any-whitespace", + **guided_decoding_kwargs["disable_any_whitespace"]) + guided_decoding_group.add_argument( + "--guided-decoding-disable-additional-properties", + **guided_decoding_kwargs["disable_additional_properties"]) guided_decoding_group.add_argument( "--reasoning-parser", # This choices is a special case because it's not static @@ -1244,7 +1257,11 @@ class EngineArgs: if self.enable_prompt_adapter else None decoding_config = DecodingConfig( - guided_decoding_backend=self.guided_decoding_backend, + backend=self.guided_decoding_backend, + disable_fallback=self.guided_decoding_disable_fallback, + disable_any_whitespace=self.guided_decoding_disable_any_whitespace, + disable_additional_properties=\ + self.guided_decoding_disable_additional_properties, reasoning_backend=self.reasoning_parser if self.enable_reasoning else None, ) @@ -1335,9 +1352,8 @@ class EngineArgs: recommend_to_remove=True) return False - # remove backend options when doing this check - if self.guided_decoding_backend.split(':')[0] \ - not in get_args(GuidedDecodingBackendV1): + if self.guided_decoding_backend not in get_args( + GuidedDecodingBackendV1): _raise_or_fallback( feature_name= f"--guided-decoding-backend={self.guided_decoding_backend}", diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c23530990..38f13d859 100644 --- 
a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2091,7 +2091,7 @@ class LLMEngine: tokenizer = self.get_tokenizer(lora_request=lora_request) guided_decoding.backend = guided_decoding.backend or \ - self.decoding_config.guided_decoding_backend + self.decoding_config.backend if self.decoding_config.reasoning_backend is not None: logger.debug("Building with reasoning backend %s", diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index eb3ae8939..d23a4c6ed 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -615,9 +615,9 @@ class MQLLMEngineClient(EngineClient): build_guided_decoding_logits_processor_async( sampling_params=params, tokenizer=await self.get_tokenizer(lora_request), - default_guided_backend=(self.decoding_config.guided_decoding_backend + default_guided_backend=(self.decoding_config.backend if self.decoding_config - else DecodingConfig.guided_decoding_backend), + else DecodingConfig.backend), model_config=self.model_config, reasoning_backend=self.decoding_config.reasoning_backend, ) diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 8fdcdcafa..4e4d697f4 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -26,8 +26,8 @@ def maybe_backend_fallback( def fallback_or_error(guided_params: GuidedDecodingParams, message: str, fallback: str) -> None: """Change the backend to the specified fallback with a warning log, - or raise a ValueError if the `no-fallback` option is specified.""" - if guided_params.no_fallback(): + or raise a ValueError if the `disable_fallback` option is specified.""" + if guided_params.disable_fallback: raise ValueError(message) logger.warning("%s Falling back to use %s instead.", message, fallback) @@ -40,7 +40,7 @@ def maybe_backend_fallback( guided_params.backend = "xgrammar" # lm-format-enforce doesn't 
support grammar, fallback to xgrammar - if guided_params.backend_name == "lm-format-enforcer": + if guided_params.backend == "lm-format-enforcer": if guided_params.grammar is not None: fallback_or_error( guided_params, @@ -55,7 +55,7 @@ def maybe_backend_fallback( "lm-format-enforcer does not support advanced JSON schema " "features like patterns or numeric ranges.", "outlines") - if guided_params.backend_name == "xgrammar": + if guided_params.backend == "xgrammar": from vllm.model_executor.guided_decoding.xgrammar_decoding import ( xgr_installed) @@ -87,7 +87,7 @@ def maybe_backend_fallback( guided_params, "xgrammar module cannot be imported successfully.", "outlines") - if (guided_params.backend_name == "outlines" + if (guided_params.backend == "outlines" and guided_params.json_object is not None): # outlines doesn't support json_object, fallback to guidance fallback_or_error(guided_params, @@ -111,7 +111,7 @@ async def get_guided_decoding_logits_processor( guided_params = maybe_backend_fallback(guided_params) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend_name == 'outlines': + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_outlines_guided_decoding_logits_processor) @@ -122,12 +122,12 @@ async def get_guided_decoding_logits_processor( get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) - if guided_params.backend_name == 'xgrammar': + if guided_params.backend == 'xgrammar': from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa get_local_xgrammar_guided_decoding_logits_processor) return get_local_xgrammar_guided_decoding_logits_processor( guided_params, tokenizer, model_config, reasoner) - if guided_params.backend_name == 'guidance': + if 
guided_params.backend == 'guidance': from vllm.model_executor.guided_decoding.guidance_decoding import ( get_local_guidance_guided_decoding_logits_processor) return get_local_guidance_guided_decoding_logits_processor( @@ -152,23 +152,23 @@ def get_local_guided_decoding_logits_processor( reasoner = reasoner_class(tokenizer) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend_name == 'outlines': + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_local_outlines_guided_decoding_logits_processor) return get_local_outlines_guided_decoding_logits_processor( guided_params, tokenizer, reasoner) - if guided_params.backend_name == 'lm-format-enforcer': + if guided_params.backend == 'lm-format-enforcer': from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) - if guided_params.backend_name == 'xgrammar': + if guided_params.backend == 'xgrammar': from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa get_local_xgrammar_guided_decoding_logits_processor) return get_local_xgrammar_guided_decoding_logits_processor( guided_params, tokenizer, model_config, reasoner) - if guided_params.backend_name == 'guidance': + if guided_params.backend == 'guidance': from vllm.model_executor.guided_decoding.guidance_decoding import ( get_local_guidance_guided_decoding_logits_processor) return get_local_guidance_guided_decoding_logits_processor( diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index 95b7c7110..0b1f4762b 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ 
b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -21,13 +21,12 @@ def get_local_guidance_guided_decoding_logits_processor( """ grm = "" - any_whitespace = 'disable-any-whitespace' not in \ - guided_params.backend_options() + any_whitespace = not guided_params.disable_any_whitespace if (guide_json := guided_params.json) is not None: # Optionally set additionalProperties to False at the top-level # By default, other backends do not allow additional top-level # properties, so this makes guidance more similar to other backends - if 'no-additional-properties' in guided_params.backend_options(): + if guided_params.disable_additional_properties: if not isinstance(guide_json, str): guide_json = json.dumps(guide_json) guide_json = process_for_additional_properties(guide_json) diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index ff223c3c9..40f722410 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -175,8 +175,7 @@ class GrammarConfig: else: json_str = guided_params.json - any_whitespace = 'disable-any-whitespace' not in \ - guided_params.backend_options() + any_whitespace = not guided_params.disable_any_whitespace # Check and log if model with xgrammar and whitespace have history # of runaway generation of whitespaces. @@ -191,11 +190,10 @@ class GrammarConfig: model_with_warn = 'Qwen' if model_with_warn is not None and any_whitespace: - msg = (f"{model_with_warn} " - f"model detected, consider set " - f"`guided_backend=xgrammar:disable-any-whitespace` " - f"to prevent runaway generation of whitespaces.") - logger.info_once(msg) + logger.info_once( + "%s model detected, consider setting " + "`disable_any_whitespace` to prevent runaway generation " + "of whitespaces.", model_with_warn) # Validate the schema and raise ValueError here if it is invalid. 
# This is to avoid exceptions in model execution, which will crash # the engine worker process. diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index c430b74a9..511571d05 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -8,6 +8,7 @@ from typing import Annotated, Any, Optional, Union import msgspec from pydantic import BaseModel +from typing_extensions import deprecated from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -37,6 +38,10 @@ class GuidedDecodingParams: json_object: Optional[bool] = None """These are other options that can be set""" backend: Optional[str] = None + backend_was_auto: bool = False + disable_fallback: bool = False + disable_any_whitespace: bool = False + disable_additional_properties: bool = False whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None @@ -68,36 +73,6 @@ class GuidedDecodingParams: structural_tag=structural_tag, ) - @property - def backend_name(self) -> str: - """Return the backend name without any options. 
- - For example if the backend is "xgrammar:no-fallback", returns "xgrammar" - """ - return (self.backend or "").split(":")[0] - - def backend_options(self) -> list[str]: - """Return the backend options as a list of strings.""" - if not self.backend or ":" not in self.backend: - return [] - return self.backend.split(":")[1].split(",") - - def add_option(self, opt_name: str) -> None: - """Adds an option to the backend options.""" - if not self.backend: - self.backend = f":{opt_name}" - elif ":" not in self.backend: - self.backend += f":{opt_name}" - else: - options = set(self.backend_options()) - options.add(opt_name) - self.backend = f"{self.backend_name}:{','.join(sorted(options))}" - - def no_fallback(self) -> bool: - """Returns True if the "no-fallback" option is supplied for the guided - decoding backend""" - return "no-fallback" in self.backend_options() - def __post_init__(self): """Validate that some fields are mutually exclusive.""" guide_count = sum([ @@ -109,6 +84,27 @@ class GuidedDecodingParams: "You can only use one kind of guided decoding but multiple are " f"specified: {self.__dict__}") + if self.backend is not None and ":" in self.backend: + self._extract_backend_options() + + @deprecated( + "Passing guided decoding backend options inside backend in the format " + "'backend:...' is deprecated. This will be removed in v0.10.0. 
Please " + "use the dedicated arguments '--disable-fallback', " + "'--disable-any-whitespace' and '--disable-additional-properties' " + "instead.") + def _extract_backend_options(self): + """Extract backend options from the backend string.""" + assert isinstance(self.backend, str) + self.backend, options = self.backend.split(":") + options_set = set(options.strip().split(",")) + if "no-fallback" in options_set: + self.disable_fallback = True + if "disable-any-whitespace" in options_set: + self.disable_any_whitespace = True + if "no-additional-properties" in options_set: + self.disable_additional_properties = True + class RequestOutputKind(Enum): # Return entire output so far in every RequestOutput diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c15e8bae..8ae5d0157 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -144,7 +144,7 @@ class Processor: if not params.guided_decoding or not self.decoding_config: return - engine_level_backend = self.decoding_config.guided_decoding_backend + engine_level_backend = self.decoding_config.backend if params.guided_decoding.backend: # Request-level backend selection is not supported in V1. # The values may differ if `params` is reused and was set @@ -152,8 +152,8 @@ class Processor: # request. We remember that it was set as a result of `auto` # using the `_auto` option set on the backend in the params. if (params.guided_decoding.backend != engine_level_backend - and not (engine_level_backend == "auto" and "_auto" - in params.guided_decoding.backend_options())): + and not (engine_level_backend == "auto" + and params.guided_decoding.backend_was_auto)): raise ValueError( "Request-level structured output backend selection is no " "longer supported. The request specified " @@ -189,7 +189,7 @@ class Processor: # are not supported in xgrammar. Fall back to guidance. 
params.guided_decoding.backend = "guidance" # Remember that this backend was set automatically - params.guided_decoding.add_option("_auto") + params.guided_decoding.backend_was_auto = True def process_inputs( self, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 0fd66c072..47ae4c4f0 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -45,17 +45,17 @@ class StructuredOutputManager: # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). if self.backend is None: - backend_name = request.sampling_params.guided_decoding.backend_name - if backend_name == "xgrammar": + backend = request.sampling_params.guided_decoding.backend + if backend == "xgrammar": from vllm.v1.structured_output.backend_xgrammar import ( XgrammarBackend) self.backend = XgrammarBackend(self.vllm_config) - elif backend_name == "guidance": + elif backend == "guidance": self.backend = GuidanceBackend(self.vllm_config) else: raise ValueError( - f"Unsupported structured output backend: {backend_name}") + f"Unsupported structured output backend: {backend}") grammar = self.executor.submit(self._async_create_grammar, request) request.structured_output_request.grammar = grammar # type: ignore[assignment] diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index d4dc5e681..8fb3e56bc 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -10,7 +10,7 @@ import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_types import 
(StructuredOutputBackend, @@ -65,19 +65,10 @@ class GuidanceBackend(StructuredOutputBackend): self.vllm_config = vllm_config self.vocab_size = vllm_config.model_config.get_vocab_size() - self.disable_any_whitespace = False - self.no_additional_properties = False - backend_options = GuidedDecodingParams( - backend=vllm_config.decoding_config.guided_decoding_backend - ).backend_options() - for option in backend_options: - if option == "disable-any-whitespace": - self.disable_any_whitespace = True - elif option == "no-additional-properties": - self.no_additional_properties = True - else: - raise ValueError( - f"Unsupported option for the guidance backend: {option}") + self.disable_any_whitespace = \ + vllm_config.decoding_config.disable_any_whitespace + self.disable_additional_properties = \ + vllm_config.decoding_config.disable_additional_properties tokenizer = tokenizer_group.get_lora_tokenizer(None) self.ll_tokenizer = llguidance_hf.from_tokenizer( @@ -87,7 +78,7 @@ class GuidanceBackend(StructuredOutputBackend): grammar_spec: str) -> StructuredOutputGrammar: self.serialized_grammar = serialize_guidance_grammar( request_type, grammar_spec, self.disable_any_whitespace, - self.no_additional_properties) + self.disable_additional_properties) ll_matcher = llguidance.LLMatcher( self.ll_tokenizer, @@ -171,11 +162,11 @@ def serialize_guidance_grammar( request_type: StructuredOutputOptions, grammar_spec: Union[str, dict[str, Any]], disable_any_whitespace: bool = False, - no_additional_properties: bool = False, + disable_additional_properties: bool = False, ) -> str: def _process_schema(grammar_spec: Union[str, dict[str, Any]], ) -> str: - if no_additional_properties: + if disable_additional_properties: grammar_spec = process_for_additional_properties(grammar_spec) return llguidance.LLMatcher.grammar_from_json_schema( grammar_spec, diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index ecaeb6e4e..50a7d1683 100644 --- 
a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -9,7 +9,7 @@ import torch import vllm.envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import LazyLoader @@ -37,16 +37,8 @@ class XgrammarBackend(StructuredOutputBackend): scheduler_config=vllm_config.scheduler_config, lora_config=vllm_config.lora_config) # type: ignore[arg-type] - self.disable_any_whitespace = False - backend_options = GuidedDecodingParams( - backend=vllm_config.decoding_config.guided_decoding_backend - ).backend_options() - for option in backend_options: - if option == "disable-any-whitespace": - self.disable_any_whitespace = True - else: - raise ValueError( - f"Unsupported option for the xgrammar backend: {option}") + self.disable_any_whitespace = \ + vllm_config.decoding_config.disable_any_whitespace tokenizer = tokenizer_group.get_lora_tokenizer(None) self.vocab_size = vllm_config.model_config.get_vocab_size() -- GitLab From 0350809f3a84d9bbc3586d3dcca98ba1400660c5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 20:52:25 +0100 Subject: [PATCH 038/461] Remove Falcon3 2x7B from CI (#17404) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/decoder_only/language/test_models.py | 7 +++---- tests/models/registry.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index d35d87459..e55a682c0 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -29,7 
+29,7 @@ AITER_MODEL_LIST = [ "openbmb/MiniCPM3-4B", "Qwen/Qwen-7B-Chat", "Qwen/Qwen2.5-0.5B-Instruct", - "ehristoforu/Falcon3-MoE-2x7B-Insruct", + "TitanML/tiny-mixtral", ] @@ -83,9 +83,8 @@ AITER_MODEL_LIST = [ pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( - "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral - marks=[pytest.mark.cpu_model, - large_gpu_mark(min_gb=48)], + "TitanML/tiny-mixtral", # mixtral + marks=[pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 8b330109d..75832d83d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -191,7 +191,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501 - {"falcon3": "ehristoforu/Falcon3-MoE-2x7B-Insruct"}), # noqa: E501 + {"tiny": "TitanML/tiny-mixtral"}), # noqa: E501 "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501 "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), -- GitLab From c9c1b59e59a35d5004e3914e23015617fc330b31 Mon Sep 17 00:00:00 2001 From: Dilip Gowda Bhagavan <110233170+dilipgb@users.noreply.github.com> Date: Wed, 30 Apr 2025 01:50:24 +0530 Subject: [PATCH 039/461] Fix: Python package installation for opentelmetry (#17049) Signed-off-by: Dilip Gowda Bhagavan --- docker/Dockerfile.s390x | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 128929ac3..9c10cd56b 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \ RUN microdnf install -y \ which procps findutils tar vim git gcc gcc-gfortran g++ make 
patch zlib-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ - openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \ microdnf clean all # Python Installation @@ -123,6 +123,7 @@ ENV UV_LINK_MODE=copy ENV CARGO_HOME=/root/.cargo ENV RUSTUP_HOME=/root/.rustup ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 COPY . /workspace/vllm WORKDIR /workspace/vllm -- GitLab From 70788bdbdc590e7fbf9bddb3fa9bc92ac3181733 Mon Sep 17 00:00:00 2001 From: Bryan Lu <55512809+luyuzhe111@users.noreply.github.com> Date: Tue, 29 Apr 2025 14:10:00 -0700 Subject: [PATCH 040/461] [V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE (#17211) Signed-off-by: Bryan Lu --- examples/offline_inference/eagle.py | 14 ++- vllm/compilation/backends.py | 15 ++- vllm/model_executor/models/llama_eagle.py | 25 +++-- vllm/model_executor/models/llama_eagle3.py | 5 +- vllm/v1/spec_decode/eagle.py | 122 +++++++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 24 ++-- 6 files changed, 152 insertions(+), 53 deletions(-) diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 474b745a6..91e2f68ec 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -36,6 +36,10 @@ def parse_args(): help="downloaded from the eagle repo " \ "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" ) + parser.add_argument("--method", + type=str, + default='eagle', + choices=['eagle', 'eagle3']) parser.add_argument("--max_num_seqs", type=int, default=8) parser.add_argument("--num_prompts", type=int, default=80) parser.add_argument("--num_spec_tokens", type=int, default=2) @@ -53,7 +57,13 @@ def main(): args = parse_args() model_dir = "meta-llama/Llama-3.1-8B-Instruct" - eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + + if args.method == 'eagle': + 
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + elif args.method == 'eagle3': + eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + else: + raise ValueError(f"unknown method: {args.method}") max_model_len = 2048 @@ -81,7 +91,7 @@ def main(): max_num_seqs=args.max_num_seqs, gpu_memory_utilization=0.8, speculative_config={ - "method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle", + "method": args.method, "model": eagle_dir, "num_speculative_tokens": args.num_spec_tokens, "draft_tensor_parallel_size": args.draft_tp, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 7012131d0..a1570b7ec 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -347,8 +347,12 @@ class VllmBackend: PASS_KEY = "post_grad_custom_post_pass" if PASS_KEY in inductor_config: # Config should automatically wrap all inductor passes - assert isinstance(inductor_config[PASS_KEY], InductorPass) - self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) + if isinstance(inductor_config[PASS_KEY], PostGradPassManager): + assert (inductor_config[PASS_KEY].uuid() == + self.post_grad_pass_manager.uuid()) + else: + assert isinstance(inductor_config[PASS_KEY], InductorPass) + self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) inductor_config[PASS_KEY] = self.post_grad_pass_manager def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: @@ -408,8 +412,13 @@ class VllmBackend: ) self.compilation_config.cache_dir = cache_dir - cache_dir = self.compilation_config.cache_dir + if compilation_counter.num_graphs_seen > 0: + cache_dir = self.compilation_config.cache_dir + \ + f'-{compilation_counter.num_graphs_seen}' + else: + cache_dir = self.compilation_config.cache_dir os.makedirs(cache_dir, exist_ok=True) + self.compilation_config.cache_dir = cache_dir rank = vllm_config.parallel_config.rank dp_rank = vllm_config.parallel_config.data_parallel_rank local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}") diff --git 
a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 56e53ac2b..76655bd71 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -6,7 +6,8 @@ import torch import torch.nn as nn from transformers import LlamaConfig -from vllm.config import ModelConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -37,17 +38,19 @@ class LlamaDecoderLayer(LlamaDecoderLayer): self.input_layernorm = nn.Identity() +@support_torch_compile class LlamaModel(nn.Module): def __init__( self, *, - model_config: ModelConfig, - start_layer_id: int = 0, + vllm_config: VllmConfig, prefix: str = "", + start_layer_id: int = 0, ) -> None: super().__init__() - self.config = model_config.hf_config + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, @@ -75,8 +78,7 @@ class LlamaModel(nn.Module): hidden_states = self.fc( torch.cat((input_embeds, hidden_states), dim=-1)) residual = None - for i in range(len(self.layers)): - layer = self.layers[i] + for layer in self.layers: hidden_states, residual = layer( positions, hidden_states, @@ -117,12 +119,13 @@ class LlamaModel(nn.Module): class EagleLlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, model_config: ModelConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): nn.Module.__init__(self) - self.config = model_config.hf_config - self.model = LlamaModel(model_config=model_config, - start_layer_id=start_layer_id, - prefix="model") + self.config = vllm_config. 
\ + speculative_config.draft_model_config.hf_config + self.model = LlamaModel(vllm_config=vllm_config, + prefix="model", + start_layer_id=start_layer_id) logit_scale = getattr(self.config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.config.vocab_size, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 0b18e4a8f..c42f19fee 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn from transformers import LlamaConfig -from vllm.config import ModelConfig +from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear @@ -167,8 +167,9 @@ class LlamaModel(nn.Module): class Eagle3LlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, model_config: ModelConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): nn.Module.__init__(self) + model_config = vllm_config.speculative_config.draft_model_config self.config = model_config.hf_config self.model = LlamaModel(model_config=model_config, start_layer_id=start_layer_id, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8c45ca9a3..81508c2e0 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -4,7 +4,7 @@ import torch.nn as nn import triton import triton.language as tl -from vllm.config import VllmConfig, set_current_vllm_config +from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader.loader import get_model_loader @@ -26,10 +26,41 @@ class EagleProposer: device: torch.device, ): self.vllm_config = vllm_config + self.method = 
self.vllm_config.speculative_config.method self.num_speculative_tokens = ( vllm_config.speculative_config.num_speculative_tokens) self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size + + self.dtype = vllm_config.model_config.dtype + + self.max_num_tokens = vllm_config.scheduler_config \ + .max_num_batched_tokens + + self.hidden_size = vllm_config.model_config.get_hidden_size() + + # TODO: make eagle3 compatible with cudagraph + self.use_cuda_graph = self.method != 'eagle3' and \ + (self.vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE and + not self.vllm_config.model_config.enforce_eager) + + self.cudagraph_batch_sizes = list( + reversed( + self.vllm_config.compilation_config.cudagraph_capture_sizes)) + + # persistent buffers for cuda graph + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=device) + + self.hidden_states = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=device) # We need +1 here because the arange is used to set query_start_loc, # which has one more element than batch_size. self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs + @@ -59,13 +90,12 @@ class EagleProposer: batch_size = next_token_ids.shape[0] last_token_indices = cu_num_tokens[1:] - 1 - input_ids = torch.empty_like(target_token_ids) # Shift the input ids by one token. # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] - input_ids[:-1] = target_token_ids[1:] + self.input_ids[:num_tokens - 1] = target_token_ids[1:] # Replace the last token with the next token. # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4] - input_ids[last_token_indices] = next_token_ids + self.input_ids[last_token_indices] = next_token_ids # FA requires seq_len to have dtype int32. 
seq_lens = (target_positions[last_token_indices] + 1).int() @@ -88,14 +118,30 @@ class EagleProposer: prefix_kv_lens=None, suffix_kv_lens=None, ) + if self.use_cuda_graph and \ + num_tokens <= self.cudagraph_batch_sizes[-1]: + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) + else: + num_input_tokens = num_tokens + # copy inputs to buffer for cudagraph + self.positions[:num_tokens] = target_positions - with set_forward_context(attn_metadata, self.vllm_config): - hidden_states_logits, hidden_states_fwd = self.model( - input_ids=input_ids, - hidden_states=target_hidden_states, - positions=target_positions, + if self.method == 'eagle': + self.hidden_states[:num_tokens] = target_hidden_states + hidden_states = self.hidden_states + else: + # TODO: make eagle3 compatible with cuda graph + hidden_states = target_hidden_states + + with set_forward_context(attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens): + last_hidden_states, hidden_states = self.model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + hidden_states=hidden_states[:num_input_tokens], ) - sample_hidden_states = hidden_states_logits[last_token_indices] + sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) draft_token_ids = logits.argmax(dim=-1) @@ -108,13 +154,20 @@ class EagleProposer: draft_token_ids_list = [draft_token_ids] positions = target_positions[last_token_indices] - hidden_states = hidden_states_fwd[last_token_indices] + hidden_states = hidden_states[last_token_indices] + if self.use_cuda_graph and \ + batch_size <= self.cudagraph_batch_sizes[-1]: + input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) + else: + input_batch_size = batch_size attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] for _ in range(self.num_speculative_tokens - 1): # Update 
the inputs. - input_ids = draft_token_ids_list[-1] + # cast to int32 is crucial when eagle model is compiled. + # tensor.argmax() returns int64 by default. + input_ids = draft_token_ids_list[-1].int() positions += 1 # NOTE(woosuk): We should handle the case where the draft model @@ -152,14 +205,27 @@ class EagleProposer: attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len, PADDING_SLOT_ID) + # copy inputs to buffer for cudagraph + self.input_ids[:batch_size] = input_ids + self.positions[:batch_size] = clamped_positions + + if self.method == 'eagle': + # TODO: make eagle3 compatible with cudagraph. + self.hidden_states[:batch_size] = hidden_states + hidden_states = self.hidden_states + # Run the model. - with set_forward_context(attn_metadata, self.vllm_config): - hidden_states_logits, hidden_states = self.model( - input_ids=input_ids, - hidden_states=hidden_states, - positions=clamped_positions, + with set_forward_context(attn_metadata, + self.vllm_config, + num_tokens=input_batch_size): + last_hidden_states, hidden_states = self.model( + input_ids=self.input_ids[:input_batch_size], + positions=self.positions[:input_batch_size], + hidden_states=hidden_states[:input_batch_size], ) - logits = self.model.compute_logits(hidden_states_logits, None) + hidden_states = hidden_states[:batch_size] + logits = self.model.compute_logits(last_hidden_states[:batch_size], + None) draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -227,13 +293,11 @@ class EagleProposer: draft_model_cls, arch = ModelRegistry.resolve_model_cls( draft_model_config.architectures) self.model = draft_model_cls( - model_config=draft_model_config, + vllm_config=self.vllm_config, start_layer_id=target_layer_num).to(target_device) loaded_weights = self.model.load_weights( - loader.get_all_weights( - self.vllm_config.speculative_config.draft_model_config, - self.model)) + loader.get_all_weights(draft_model_config, self.model)) if 
self.vllm_config.speculative_config.method == "eagle3": if "model.embed_tokens.weight" not in loaded_weights: logger.info( @@ -243,6 +307,20 @@ class EagleProposer: logger.info("Loading EAGLE LM head weights from the target model.") self.model.lm_head = target_model.lm_head + @torch.inference_mode() + def dummy_run( + self, + num_tokens: int, + ) -> None: + with set_forward_context(None, self.vllm_config, + num_tokens=num_tokens): + if self.method == 'eagle': + self.model( + input_ids=self.input_ids[:num_tokens], + positions=self.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + ) + # NOTE(woosuk): Currently, the below code is not used and we always use argmax # to sample the draft tokens. We will use this after we find a way to manage diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4711beadb..41de305a0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1106,7 +1106,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): # For mid-pipeline stages, return the hidden states. return hidden_states - hidden_states = hidden_states[:num_scheduled_tokens] sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) @@ -1172,7 +1171,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Compute prompt logprobs if needed. prompt_logprobs_dict = self._get_prompt_logprobs_dict( - hidden_states, + hidden_states[:num_scheduled_tokens], scheduler_output, ) @@ -1222,15 +1221,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): if spec_decode_metadata is None: # input_ids can be None for multimodal models. - # We need to slice token_ids, positions, and hidden_states - # because the eagle head does not use cuda graph and should - # not include padding. 
target_token_ids = self.input_ids[:num_scheduled_tokens] target_positions = positions[:num_scheduled_tokens] if self.use_aux_hidden_state_outputs: - target_hidden_states = [ - h[:num_scheduled_tokens] for h in aux_hidden_states - ] + target_hidden_states = torch.cat( + [h[:num_scheduled_tokens] for h in aux_hidden_states], + dim=-1) else: target_hidden_states = hidden_states[:num_scheduled_tokens] target_slot_mapping = attn_metadata.slot_mapping @@ -1254,15 +1250,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): target_token_ids = self.input_ids[token_indices] target_positions = positions[token_indices] if self.use_aux_hidden_state_outputs: - target_hidden_states = [ - h[token_indices] for h in aux_hidden_states - ] + target_hidden_states = torch.cat( + [h[token_indices] for h in aux_hidden_states], dim=-1) else: target_hidden_states = hidden_states[token_indices] target_slot_mapping = attn_metadata.slot_mapping[token_indices] - if self.use_aux_hidden_state_outputs: - target_hidden_states = torch.cat(target_hidden_states, dim=-1) draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, @@ -1506,6 +1499,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): else: hidden_states = outputs + if self.use_spec_decode and \ + self.speculative_config.method in ('eagle', 'eagle3'): + assert isinstance(self.drafter, EagleProposer) + self.drafter.dummy_run(num_tokens) + logit_indices = np.cumsum(num_scheduled_tokens) - 1 return hidden_states[logit_indices] -- GitLab From 7489ec0bab2904dcc4001af59a942a16756fdbbc Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Apr 2025 22:10:31 +0100 Subject: [PATCH 041/461] Remove Bamba 9B from CI (#17407) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/decoder_only/language/test_hybrid.py | 2 +- tests/models/registry.py | 3 ++- tests/v1/test_oracle.py | 2 +- 3 files changed, 4 insertions(+), 3 
deletions(-) diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/decoder_only/language/test_hybrid.py index 5931c25b8..e5e0c28ae 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/decoder_only/language/test_hybrid.py @@ -28,7 +28,7 @@ HYBRID_MODELS = [ # not compatible with pip-compile. "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", - "ibm-ai-platform/Bamba-9B", + "hmellor/bamba-tiny-random", ] # Avoid OOM diff --git a/tests/models/registry.py b/tests/models/registry.py index 75832d83d..f17f70189 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -120,7 +120,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", trust_remote_code=True), - "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"), + "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B", + extras={"tiny": "hmellor/bamba-tiny-random"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 94c8ad7c9..c34c673e9 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -12,7 +12,7 @@ UNSUPPORTED_MODELS_V1 = [ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder "mistralai/Mamba-Codestral-7B-v0.1", # mamba - "ibm-ai-platform/Bamba-9B", # hybrid + "hmellor/bamba-tiny-random", # hybrid "BAAI/bge-m3", # embedding ] -- GitLab From 34120f5acd693924cc783dfaa33733afbe9ae8b0 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 29 Apr 2025 17:02:10 -0700 Subject: [PATCH 042/461] [V1][Feature] Enable Speculative Decoding with Structured Outputs (#14702) Signed-off-by: Benjamin Chislett Signed-off-by: Benjamin Chislett --- benchmarks/backend_request_func.py | 1 + .../benchmark_serving_structured_output.py | 9 
++- .../llm/test_struct_output_generate.py | 35 ++++++++-- vllm/v1/core/sched/scheduler.py | 17 +++-- vllm/v1/structured_output/__init__.py | 59 ++++++++++++---- vllm/v1/structured_output/backend_guidance.py | 21 ++++++ vllm/v1/structured_output/backend_types.py | 24 +++++++ vllm/v1/structured_output/backend_xgrammar.py | 32 ++++++++- vllm/v1/worker/gpu_model_runner.py | 70 +++++++++++-------- 9 files changed, 209 insertions(+), 59 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index efd51c79c..3405aaebf 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -260,6 +260,7 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, + "repetition_penalty": 1.0, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 74ee00ec8..7c40e39ac 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -123,6 +123,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, copy.deepcopy(schema) for _ in range(args.num_prompts) ] for i in range(len(json_schemas)): + if "properties" not in json_schemas[i]: + json_schemas[i]["properties"] = {} json_schemas[i]["properties"][ f"__optional_field_{uuid.uuid4()}"] = { "type": @@ -134,7 +136,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, json_schemas = [schema] * args.num_prompts def gen_prompt(index: int): - return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 + return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 def get_schema(index: int): return 
json_schemas[index % len(json_schemas)] @@ -231,7 +233,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, idx -= len_dataset schema = dataset["schema"][idx] prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False) + tokenize=False, + add_generation_prompt=True) input_len = len(tokenizer(prompt).input_ids) completion = dataset["completion"][idx] @@ -849,7 +852,7 @@ if __name__ == "__main__": 'json', 'json-unique', 'grammar', 'regex', 'choice', 'xgrammar_bench' ]) - parser.add_argument("--json_schema_path", + parser.add_argument("--json-schema-path", type=str, default=None, help="Path to json schema.") diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 29ec6088e..d25699591 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -16,13 +16,31 @@ from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.sampling_params import GuidedDecodingParams, SamplingParams +NGRAM_SPEC_CONFIG = { + "model": "[ngram]", + "num_speculative_tokens": 5, + "prompt_lookup_max": 5, + "prompt_lookup_min": 1, +} + +EAGLE_SPEC_CONFIG = { + "method": "eagle", + "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", + "num_speculative_tokens": 5, +} + PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ - ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto"), - ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto"), - ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral"), - ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), + ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), #FIXME: This test is flaky on CI thus disabled 
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", + NGRAM_SPEC_CONFIG), + ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG), + ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", + EAGLE_SPEC_CONFIG) ] PARAMS_MODELS_TOKENIZER_MODE = [ @@ -45,8 +63,9 @@ class CarDescription(BaseModel): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("model_name, guided_decoding_backend, tokenizer_mode", - PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) +@pytest.mark.parametrize( + "model_name, guided_decoding_backend, tokenizer_mode, speculative_config", + PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], @@ -58,6 +77,7 @@ def test_structured_output( guided_decoding_backend: str, tokenizer_mode: str, model_name: str, + speculative_config: dict[str, Any], ): monkeypatch.setenv("VLLM_USE_V1", "1") @@ -71,7 +91,8 @@ def test_structured_output( max_model_len=1024, guided_decoding_backend=guided_decoding_backend, guided_decoding_disable_any_whitespace=True, - tokenizer_mode=tokenizer_mode) + tokenizer_mode=tokenizer_mode, + speculative_config=speculative_config) # # Test 1: Generate JSON output based on a provided schema diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 21711c929..7ebbb4954 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -441,7 +441,7 @@ class Scheduler(SchedulerInterface): grammar_bitmask = self.structured_output_manager.grammar_bitmask( self.requests, structured_output_request_ids, - len(self.running), + scheduled_spec_decode_tokens, ) # Construct the scheduler output. new_reqs_data = [ @@ -682,10 +682,6 @@ class Scheduler(SchedulerInterface): self.encoder_cache_manager.free_encoder_input( request, input_id) - # Add newly generated spec token ids to the request. 
- if spec_token_ids is not None: - request.spec_token_ids = spec_token_ids[req_index] - stopped = False new_logprobs = None new_token_ids = generated_token_ids @@ -717,6 +713,17 @@ class Scheduler(SchedulerInterface): request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] req_id, new_token_ids) + # Add newly generated spec token ids to the request. + if spec_token_ids is not None: + if request.use_structured_output: + metadata = request.structured_output_request + assert metadata is not None and metadata.grammar is not None + # Needs to happen after new_token_ids are accepted. + request.spec_token_ids = metadata.grammar.validate_tokens( + spec_token_ids[req_index]) + else: + request.spec_token_ids = spec_token_ids[req_index] + # Get prompt logprobs for this request. prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id) if new_token_ids: diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 47ae4c4f0..3183edb7c 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -27,6 +27,7 @@ class StructuredOutputManager: def __init__(self, vllm_config: VllmConfig): self.backend: Optional[StructuredOutputBackend] = None self.vllm_config = vllm_config + self._grammar_bitmask: Optional[torch.Tensor] = None # The default max_workers if not specified is the number of CPUs * 5, @@ -80,7 +81,7 @@ class StructuredOutputManager: self, requests: dict[str, Request], structured_output_request_ids: dict[str, int], - batch_len: int, + scheduled_spec_decode_tokens: dict[str, list[int]], ) -> Optional[npt.NDArray[np.int32]]: # Prepare the structured output bitmask for this batch. 
if not structured_output_request_ids: @@ -88,20 +89,52 @@ class StructuredOutputManager: if self._grammar_bitmask is None: assert self.backend is not None - self._grammar_bitmask = self.backend.allocate_token_bitmask( - self.vllm_config.scheduler_config.max_num_seqs) - - # Fill the bitmask using the index of each request equal to its - # position in the batch. Resize the bitmask down to the size of - # the batch. - bitmask_tensor = self._grammar_bitmask - for req_id, batch_index in structured_output_request_ids.items(): + max_batch_size = self.vllm_config.scheduler_config.max_num_seqs + if self.vllm_config.speculative_config is not None: + max_num_spec_tokens = self.vllm_config.\ + speculative_config.num_speculative_tokens + else: + max_num_spec_tokens = 0 + + # Allocate a bitmask for each token needing to be checked: + # one for each speculative position, and one more for the + # bonus token / non-speculative token. + self._grammar_bitmask = \ + self.backend.allocate_token_bitmask( + max_batch_size * (1 + max_num_spec_tokens)) + + # Generate a batched bitmask for all structured output requests. + # When speculative decoding is enabled, we need to include multiple + # masks for each request, one for each possible bonus token position. + # These are stored inline in the tensor and unpacked by the gpu runner. + cumulative_index = 0 + ordered_seq = sorted(structured_output_request_ids.items(), + key=lambda x: x[1]) + # NOTE: This outer loop can likely be parallelized to improve + # performance of bitmask generation for large batches. 
+ for req_id, _ in ordered_seq: request = requests[req_id].structured_output_request assert request is not None and request.grammar is not None - if not request.grammar.is_terminated(): - request.grammar.fill_bitmask(bitmask_tensor, batch_index) - if batch_len < self._grammar_bitmask.shape[0]: - bitmask_tensor = self._grammar_bitmask[:batch_len] + state_advancements = 0 + req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] + for i, token in enumerate(req_tokens): + if not request.grammar.is_terminated(): + request.grammar.fill_bitmask(self._grammar_bitmask, + cumulative_index) + if token is not None: + # In order to generate the correct bitmask for each + # position in the speculative sequence, we advance + # the FSM state for each speculative token and rollback + # to restore the previous state when we are finished. + assert request.grammar.accept_tokens(req_id, [token]) + state_advancements += 1 + cumulative_index += 1 + if state_advancements > 0: + request.grammar.rollback(state_advancements) + + bitmask_tensor = self._grammar_bitmask + if cumulative_index < self._grammar_bitmask.shape[0]: + bitmask_tensor = self._grammar_bitmask[:cumulative_index] # After finishing with the xgrammar operations, we convert to # np.ndarray, because that is much more efficient for serialization diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 8fb3e56bc..0ab175e78 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -144,6 +144,27 @@ class GuidanceGrammar(StructuredOutputGrammar): return r + def validate_tokens(self, tokens: list[int]) -> list[int]: + """Checks if the list of tokens are accepted by the parser in sequence. + Will not advance the parser. + + Returns the prefix list of tokens that are accepted by the parser. 
+ """ + if len(tokens) == 0: + return [] + if self.ll_matcher.is_stopped(): + return [] + + num_tokens = self.ll_matcher.validate_tokens(tokens) + + self.check_error() + + return tokens[:num_tokens] + + def rollback(self, num_tokens: int) -> None: + self.ll_matcher.rollback(num_tokens) + self.check_error() + def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None: # this will automatically return [EOS] mask if the matcher is stopped # or otherwise in an error state diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 6330bcbf2..33ca9f8cf 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -35,6 +35,30 @@ class StructuredOutputGrammar(ABC): bool: True if the tokens are accepted, False otherwise. """ + @abstractmethod + def validate_tokens(self, tokens: list[int]) -> list[int]: + """ + Validates the provided tokens against the grammar. + Will not advance the FSM. + + Args: + tokens (list[int]): A list of token IDs to validate. + + Returns: + list[int]: A list of accepted token IDs. Will be a prefix + of the input tokens, and empty if none are accepted. + """ + + @abstractmethod + def rollback(self, num_tokens: int) -> None: + """ + Rolls back the state of the grammar by a specified number of tokens. + Will also revert counters for the number of processed tokens. + + Args: + num_tokens (int): The number of tokens to roll back. 
+ """ + @abstractmethod def fill_bitmask(self, bitmask: torch.Tensor, batch_index: int) -> None: """ diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 50a7d1683..c82a3cab2 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -40,6 +40,11 @@ class XgrammarBackend(StructuredOutputBackend): self.disable_any_whitespace = \ vllm_config.decoding_config.disable_any_whitespace + self.num_speculative_tokens = 0 + if self.vllm_config.speculative_config is not None: + self.num_speculative_tokens = \ + self.vllm_config.speculative_config.num_speculative_tokens + tokenizer = tokenizer_group.get_lora_tokenizer(None) self.vocab_size = vllm_config.model_config.get_vocab_size() if isinstance(tokenizer, MistralTokenizer): @@ -118,7 +123,10 @@ class XgrammarBackend(StructuredOutputBackend): f"grammar is not of valid supported types. ({request_type!s})") return XgrammarGrammar( - matcher=xgr.GrammarMatcher(ctx), + matcher=xgr.GrammarMatcher( + ctx, + max_rollback_tokens=self.num_speculative_tokens, + ), vocab_size=self.vocab_size, ctx=ctx, ) @@ -136,7 +144,6 @@ class XgrammarGrammar(StructuredOutputGrammar): # supporting different backends, in the future. # For now, just xgrammar. # - # TODO: support max_rollback_tokens # https://xgrammar.mlc.ai/docs/api/python/index.html#xgrammar.GrammarMatcher.find_jump_forward_string # for jump-forward decoding @@ -163,6 +170,27 @@ class XgrammarGrammar(StructuredOutputGrammar): self.num_processed_tokens += 1 return True + def validate_tokens(self, tokens: list[int]) -> list[int]: + """Checks if the list of tokens are accepted by the FSM in sequence. + Will not advance the FSM. + + Returns the prefix list of tokens that are accepted by the FSM. 
+ """ + accepted_tokens = [] + for token in tokens: + if self.matcher.accept_token(token): + accepted_tokens.append(token) + else: + break + if len(accepted_tokens) > 0: + # Rollback the FSM to the initial state + self.matcher.rollback(len(accepted_tokens)) + return accepted_tokens + + def rollback(self, num_tokens: int) -> None: + self.matcher.rollback(num_tokens) + self.num_processed_tokens -= num_tokens + def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None: self.matcher.fill_next_token_bitmask(bitmask, idx) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 41de305a0..97d8c91b4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -957,46 +957,58 @@ class GPUModelRunner(LoRAModelRunnerMixin): scheduler_output: "SchedulerOutput", logits: torch.Tensor, ): - # Serialization of np.ndarray is much more efficient than a tensor, - # so we receive it in that format. grammar_bitmask = scheduler_output.grammar_bitmask if grammar_bitmask is None: return - # We receive the structured output bitmask from the scheduler, but the - # indices of the requests in the batch may not match the indices of - # the bitmask since the scheduler doesn't know how the gpu runner is - # ordering the requests in the batch. We need to sort the bitmask to - # match the order of the requests used here. + # We receive the structured output bitmask from the scheduler, + # compacted to contain bitmasks only for structured output requests. + # The order of the requests in the bitmask is not guaranteed to be the + # same as the order of the requests in the gpu runner's batch. We need + # to sort the bitmask to match the order of the requests used here. + + # Get the batch indices of the structured output requests. + # Keep track of the number of speculative tokens scheduled for every + # request in the batch, as the logit indices are offset by this amount. 
struct_out_req_batch_indices: dict[str, int] = {} - indices_match = True - for req_id in self.input_batch.req_ids: - mask_index = scheduler_output.structured_output_request_ids.get( - req_id) - if mask_index is None: - # not a structured output request - continue - batch_index = self.input_batch.req_id_to_index[req_id] - if batch_index != mask_index: - indices_match = False - struct_out_req_batch_indices[req_id] = batch_index - - if not indices_match: - # Sort the bitmask to match the order of the requests - sorted_bitmask = np.zeros_like(grammar_bitmask) - for req_id, batch_index in struct_out_req_batch_indices.items(): - orig_index = scheduler_output.structured_output_request_ids[ - req_id] - sorted_bitmask[batch_index] = grammar_bitmask[orig_index] - grammar_bitmask = sorted_bitmask + cumulative_offset = 0 + seq = sorted(self.input_batch.req_id_to_index.items(), + key=lambda x: x[1]) + for req_id, batch_index in seq: + logit_index = batch_index + cumulative_offset + cumulative_offset += len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + if req_id in scheduler_output.structured_output_request_ids: + struct_out_req_batch_indices[req_id] = logit_index + + out_indices = [] + + # Reorder the bitmask to match the order of the requests in the batch. 
+ sorted_bitmask = np.zeros_like(grammar_bitmask, + shape=(logits.shape[0], + grammar_bitmask.shape[1])) + cumulative_index = 0 + seq = sorted(scheduler_output.structured_output_request_ids.items(), + key=lambda x: x[1]) + for req_id, _ in seq: + logit_index = struct_out_req_batch_indices[req_id] + num_spec_tokens = len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) + cumulative_index += 1 + num_spec_tokens + grammar_bitmask = sorted_bitmask + # Serialization of np.ndarray is much more efficient than a tensor, + # so we receive it in that format. grammar_bitmask = torch.from_numpy(grammar_bitmask) - # TODO: compatibility with spec decode xgr.apply_token_bitmask_inplace( logits, grammar_bitmask.to(self.device, non_blocking=True), - indices=list(struct_out_req_batch_indices.values()), + indices=out_indices, ) @torch.inference_mode() -- GitLab From 4055130a85ee4ff5b618e056306d8f7924b3bd65 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 29 Apr 2025 17:52:11 -0700 Subject: [PATCH 043/461] [release] Always git fetch all to get latest tag on TPU release (#17322) --- .buildkite/release-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index a21a657c4..642c0259c 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -57,6 +57,7 @@ steps: agents: queue: tpu_queue_postmerge commands: + - "git fetch --all" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." 
- "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" -- GitLab From 1c2bc7ead019cdf5b04b2f1d07b00982352f85ef Mon Sep 17 00:00:00 2001 From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Date: Tue, 29 Apr 2025 22:24:57 -0300 Subject: [PATCH 044/461] Truncation control for embedding models (#14776) Signed-off-by: Gabriel Marinho Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser --- tests/entrypoints/openai/test_truncation.py | 103 ++++++++++++++++++ .../language/test_truncation_control.py | 69 ++++++++++++ vllm/engine/llm_engine.py | 3 + vllm/engine/protocol.py | 4 +- vllm/entrypoints/llm.py | 25 ++++- vllm/entrypoints/openai/protocol.py | 8 +- vllm/entrypoints/openai/serving_embedding.py | 14 +-- vllm/entrypoints/openai/serving_engine.py | 10 +- vllm/entrypoints/openai/serving_pooling.py | 14 +-- vllm/entrypoints/openai/serving_score.py | 15 +-- vllm/entrypoints/score_utils.py | 2 +- vllm/entrypoints/utils.py | 24 ++++ vllm/inputs/preprocess.py | 63 ++++++++--- vllm/sampling_params.py | 7 +- vllm/transformers_utils/tokenizer.py | 14 ++- vllm/transformers_utils/tokenizer_base.py | 2 + vllm/transformers_utils/tokenizer_group.py | 9 ++ vllm/transformers_utils/tokenizers/mistral.py | 2 + vllm/v1/engine/async_llm.py | 6 +- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 4 +- 21 files changed, 332 insertions(+), 70 deletions(-) create mode 100644 tests/entrypoints/openai/test_truncation.py create mode 100644 tests/models/embedding/language/test_truncation_control.py diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py new file mode 100644 index 000000000..137ed9db8 --- /dev/null +++ b/tests/entrypoints/openai/test_truncation.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Any + +import openai +import pytest +import pytest_asyncio + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = 
"sentence-transformers/all-MiniLM-L12-v2" +max_model_len = 128 + +input = """Immerse yourself in the enchanting chronicle of calculus, a + mathematical domain that has radically transformed our comprehension of + change and motion. Despite its roots in ancient civilizations, the + formal birth of calculus predominantly occurred in the 17th century, + primarily under the influential guidance of Sir Isaac Newton and Gottfried + Wilhelm Leibniz. The earliest traces of calculus concepts are found in + ancient Greek mathematics,most notably in the works of Eudoxus and + Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a + technique for computing areas and volumes through the use of finite sums. + This methodology laid crucial foundational work for integral calculus. + In the 17th century, both Newton and Leibniz independently pioneered + calculus, each contributing unique perspectives that would shape this new + field.""" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embed", + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + str(max_model_len), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_smaller_truncation_size(client: openai.AsyncOpenAI): + truncation_size = 10 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + response = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert response["usage"]["prompt_tokens"] == truncation_size + + +@pytest.mark.asyncio +async def test_bigger_truncation_size(client: openai.AsyncOpenAI): + truncation_size = max_model_len + 1 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + with 
pytest.raises(openai.BadRequestError) as err: + err = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert str(err) == f"""openai.BadRequestError: + Error code: 400 - {{'object': 'error', + 'message': 'truncate_prompt_tokens value + ({truncation_size}) + is greater than max_model_len ({max_model_len}). + Please, select a smaller truncation size.', + 'type': 'BadRequestError', + 'param': None, 'code': 400}}""" + + +@pytest.mark.asyncio +async def test_max_truncation_size(client: openai.AsyncOpenAI): + truncation_size = -1 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + response = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert response["usage"]["prompt_tokens"] == max_model_len diff --git a/tests/models/embedding/language/test_truncation_control.py b/tests/models/embedding/language/test_truncation_control.py new file mode 100644 index 000000000..a215e1ec5 --- /dev/null +++ b/tests/models/embedding/language/test_truncation_control.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" +max_model_len = 128 + +input_str = """Immerse yourself in the enchanting chronicle of calculus, a + mathematical domain that has radically transformed our comprehension of + change and motion. Despite its roots in ancient civilizations, the + formal birth of calculus predominantly occurred in the 17th century, + primarily under the influential guidance of Sir Isaac Newton and Gottfried + Wilhelm Leibniz. The earliest traces of calculus concepts are found in + ancient Greek mathematics,most notably in the works of Eudoxus and + Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a + technique for computing areas and volumes through the use of finite sums. + This methodology laid crucial foundational work for integral calculus. 
+ In the 17th century, both Newton and Leibniz independently pioneered + calculus, each contributing unique perspectives that would shape this new + field.""" + + +def test_smaller_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + + truncate_prompt_tokens = 10 + + with vllm_runner(model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + vllm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + prompt_tokens = vllm_output[0].prompt_token_ids + + assert len(prompt_tokens) == truncate_prompt_tokens + + +def test_max_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + truncate_prompt_tokens = -1 + + with vllm_runner(model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + vllm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + prompt_tokens = vllm_output[0].prompt_token_ids + + assert len(prompt_tokens) == max_model_len + + +def test_bigger_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + + truncate_prompt_tokens = max_model_len + 1 + + with pytest.raises(ValueError), vllm_runner( + model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + + llm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + assert llm_output == f"""truncate_prompt_tokens value + ({truncate_prompt_tokens}) is greater than + max_model_len ({max_model_len}). 
Please, select + a smaller truncation size.""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 38f13d859..8481181eb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -645,6 +645,7 @@ class LLMEngine: params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -678,6 +679,7 @@ class LLMEngine: params: Optional[Union[SamplingParams, PoolingParams]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -758,6 +760,7 @@ class LLMEngine: processed_inputs = self.input_preprocessor.preprocess( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 7e5ac3a28..5632e8ad4 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -2,7 +2,7 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, List, Mapping, Optional +from typing import AsyncGenerator, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig, VllmConfig @@ -256,7 +256,7 @@ class EngineClient(ABC): async def do_log_stats( self, scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, + model_output: Optional[list[SamplerOutput]] = None, ) -> None: ... 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 948e8f36e..f1f48c700 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -25,6 +25,7 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, resolve_chat_template_content_format) from vllm.entrypoints.score_utils import (_cosine_similarity, _validate_score_input_lens) +from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import is_token_prompt, parse_and_batch_prompt from vllm.logger import init_logger @@ -793,6 +794,7 @@ class LLM: pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, *, + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -807,6 +809,7 @@ class LLM: pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[list[int]] = None, + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -821,6 +824,7 @@ class LLM: pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[list[list[int]]] = None, + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -836,6 +840,7 @@ class LLM: Sequence[PoolingParams]]] = None, *, prompt_token_ids: list[int], + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -851,6 +856,7 @@ class LLM: 
Sequence[PoolingParams]]] = None, *, prompt_token_ids: list[list[int]], + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -864,6 +870,7 @@ class LLM: prompts: None, pooling_params: None, prompt_token_ids: Union[list[int], list[list[int]]], + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -882,6 +889,7 @@ class LLM: pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -946,10 +954,15 @@ class LLM: for pooling_param in pooling_params: pooling_param.verify(self.llm_engine.model_config) + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size(self.llm_engine.model_config.max_model_len, + truncate_prompt_tokens, tokenization_kwargs) + self._validate_and_add_requests( prompts=parsed_prompts, params=pooling_params, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, prompt_adapter_request=prompt_adapter_request, ) @@ -962,6 +975,7 @@ class LLM: prompts: Union[PromptType, Sequence[PromptType]], /, *, + truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, @@ -995,6 +1009,7 @@ class LLM: "Embedding API is only enabled for `--task embed`") items = self.encode(prompts, + truncate_prompt_tokens=truncate_prompt_tokens, use_tqdm=use_tqdm, pooling_params=pooling_params, lora_request=lora_request, @@ -1055,6 +1070,7 @@ class LLM: encoded_output: 
list[PoolingRequestOutput] = self.encode( text_1 + text_2, + truncate_prompt_tokens=truncate_prompt_tokens, use_tqdm=use_tqdm, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) @@ -1098,9 +1114,8 @@ class LLM: pooling_params = PoolingParams() tokenization_kwargs: dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens + _validate_truncation_size(self.llm_engine.model_config.max_model_len, + truncate_prompt_tokens, tokenization_kwargs) parsed_prompts = [] @@ -1323,6 +1338,7 @@ class LLM: Sequence[PoolingParams]], lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], prompt_adapter_request: Optional[PromptAdapterRequest], + tokenization_kwargs: Optional[dict[str, Any]] = None, guided_options: Optional[GuidedDecodingRequest] = None, priority: Optional[list[int]] = None, ) -> None: @@ -1359,6 +1375,7 @@ class LLM: self._add_request( prompt, params[i] if isinstance(params, Sequence) else params, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request[i] if isinstance( lora_request, Sequence) else lora_request, prompt_adapter_request=prompt_adapter_request, @@ -1369,6 +1386,7 @@ class LLM: self, prompt: PromptType, params: Union[SamplingParams, PoolingParams], + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -1379,6 +1397,7 @@ class LLM: prompt, params, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, prompt_adapter_request=prompt_adapter_request, priority=priority, ) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 015943762..d444442a9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1014,7 +1014,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): encoding_format: 
Literal["float", "base64"] = "float" dimensions: Optional[int] = None user: Optional[str] = None - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # doc: begin-embedding-pooling-params additional_data: Optional[Any] = None @@ -1049,7 +1049,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None user: Optional[str] = None - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # doc: begin-chat-embedding-pooling-params additional_data: Optional[Any] = None @@ -1116,7 +1116,7 @@ class ScoreRequest(OpenAIBaseModel): model: Optional[str] = None text_1: Union[list[str], str] text_2: Union[list[str], str] - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # doc: begin-score-pooling-params additional_data: Optional[Any] = None @@ -1142,7 +1142,7 @@ class RerankRequest(OpenAIBaseModel): query: str documents: list[str] top_n: int = Field(default_factory=lambda: 0) - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # doc: begin-rerank-pooling-params additional_data: Optional[Any] = None diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index ba960de17..4b4d2d8b7 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -21,6 +21,7 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.utils import _validate_truncation_size from 
vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, PoolingRequestOutput) @@ -85,16 +86,7 @@ class OpenAIServingEmbedding(OpenAIServing): request_id = f"embd-{self._base_request_id(raw_request)}" created_time = int(time.time()) - truncate_prompt_tokens = None - - if request.truncate_prompt_tokens is not None: - if request.truncate_prompt_tokens <= self.max_model_len: - truncate_prompt_tokens = request.truncate_prompt_tokens - else: - return self.create_error_response( - "truncate_prompt_tokens value is " - "greater than max_model_len." - " Please, select a smaller truncation size.") + truncate_prompt_tokens = request.truncate_prompt_tokens pooling_params = request.to_pooling_params() @@ -104,6 +96,8 @@ class OpenAIServingEmbedding(OpenAIServing): return self.create_error_response(str(e)) try: + truncate_prompt_tokens = _validate_truncation_size( + self.max_model_len, truncate_prompt_tokens) ( lora_request, prompt_adapter_request, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 49b346a23..c3121eff5 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -173,7 +173,7 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, prompt: str, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]], add_special_tokens: bool, ) -> TextTokensPrompt: if (self.model_config.encoder_config is not None @@ -271,7 +271,7 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, prompt_input: Union[str, list[int]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> TextTokensPrompt: """ @@ -292,7 +292,7 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, prompt_inputs: Iterable[Union[str, 
list[int]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: """ @@ -321,7 +321,7 @@ class OpenAIServing: request: AnyRequest, tokenizer: AnyTokenizer, input_or_inputs: Union[str, list[str], list[int], list[list[int]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> list[TextTokensPrompt]: """ @@ -356,7 +356,7 @@ class OpenAIServing: request: CompletionLikeRequest, tokenizer: AnyTokenizer, input_or_inputs: Union[str, list[str], list[int], list[list[int]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, add_special_tokens: bool = True, ) -> tuple[list[TextTokensPrompt], list[TokensPrompt]]: request_prompts = await self._tokenize_prompt_input_or_inputs_async( diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 779a3eded..7c401d4f5 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -21,6 +21,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, PoolingResponseData, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators @@ -85,18 +86,11 @@ class OpenAIServingPooling(OpenAIServing): request_id = f"pool-{self._base_request_id(raw_request)}" created_time = int(time.time()) - truncate_prompt_tokens = None - - if request.truncate_prompt_tokens is not None: - if 
request.truncate_prompt_tokens <= self.max_model_len: - truncate_prompt_tokens = request.truncate_prompt_tokens - else: - return self.create_error_response( - "truncate_prompt_tokens value is " - "greater than max_model_len." - " Please, select a smaller truncation size.") + truncate_prompt_tokens = request.truncate_prompt_tokens try: + truncate_prompt_tokens = _validate_truncation_size( + self.max_model_len, truncate_prompt_tokens) ( lora_request, prompt_adapter_request, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 73b4288cb..9bdacb551 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -18,6 +18,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.score_utils import (_cosine_similarity, _validate_score_input_lens) +from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -231,11 +232,6 @@ class ServingScores(OpenAIServing): truncate_prompt_tokens: Optional[int] = None, ) -> list[PoolingRequestOutput]: - tokenization_kwargs: dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - ( lora_request, prompt_adapter_request, @@ -247,12 +243,9 @@ class ServingScores(OpenAIServing): tokenizer = await self.engine_client.get_tokenizer(lora_request) - if truncate_prompt_tokens is not None and \ - truncate_prompt_tokens > self.max_model_len: - raise ValueError( - f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " - f"is greater than max_model_len ({self.max_model_len})." 
- f" Please, select a smaller truncation size.") + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size(self.max_model_len, truncate_prompt_tokens, + tokenization_kwargs) trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 53411a27b..80b6c07c6 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -46,4 +46,4 @@ def _validate_score_input_lens( if len(texts_1) == 0: raise ValueError("At least one text element must be given") if len(texts_2) == 0: - raise ValueError("At least one text_pair element must be given") + raise ValueError("At least one text_pair element must be given") \ No newline at end of file diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index b88c2b3a0..2fe6e1a9e 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -3,6 +3,7 @@ import asyncio import functools import os +from typing import Any, Optional from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse @@ -134,3 +135,26 @@ def cli_env_setup(): if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def _validate_truncation_size( + max_model_len: int, + truncate_prompt_tokens: Optional[int], + tokenization_kwargs: Optional[dict[str, Any]] = None, +) -> Optional[int]: + + if truncate_prompt_tokens is not None: + if truncate_prompt_tokens <= -1: + truncate_prompt_tokens = max_model_len + + if truncate_prompt_tokens > max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({max_model_len})." 
+ f" Please, select a smaller truncation size.") + + if tokenization_kwargs is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + return truncate_prompt_tokens diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0edb6da06..56b60b893 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -2,7 +2,7 @@ import asyncio from collections.abc import Mapping -from typing import Optional, Union, cast +from typing import Any, Optional, Union, cast from typing_extensions import assert_never @@ -183,18 +183,21 @@ class InputPreprocessor: self, prompt: str, lora_request: Optional[LoRARequest], + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: """ Apply the model's tokenizer to a text prompt, returning the corresponding token IDs. """ tokenizer = self.get_tokenizer_group() - add_special_tokens = None + if tokenization_kwargs is None: + tokenization_kwargs = {} + if self.model_config.hf_config.model_type == "whisper": # For Whisper, special tokens should be provided by the user based # on the task and language of their request. Also needed to avoid # appending an EOS token to the prompt which disrupts generation. 
- add_special_tokens = False + tokenization_kwargs["add_special_tokens"] = False if (self.model_config.encoder_config is not None and self.model_config.encoder_config.get( @@ -203,25 +206,27 @@ class InputPreprocessor: return tokenizer.encode(prompt=prompt, lora_request=lora_request, - add_special_tokens=add_special_tokens) + **tokenization_kwargs) async def _tokenize_prompt_async( self, prompt: str, lora_request: Optional[LoRARequest], + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() - add_special_tokens = None + if tokenization_kwargs is None: + tokenization_kwargs = {} + if self.model_config.hf_config.model_type == "whisper": # For Whisper, special tokens should be provided by the user based # on the task and language of their request. Also needed to avoid # appending an EOS token to the prompt which disrupts generation. - add_special_tokens = False - return await tokenizer.encode_async( - prompt=prompt, - lora_request=lora_request, - add_special_tokens=add_special_tokens) + tokenization_kwargs["add_special_tokens"] = False + return await tokenizer.encode_async(prompt=prompt, + lora_request=lora_request, + **tokenization_kwargs) def _process_multimodal( self, @@ -281,6 +286,7 @@ class InputPreprocessor: def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: @@ -304,6 +310,7 @@ class InputPreprocessor: prompt_token_ids = self._tokenize_prompt( prompt_text, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) return token_inputs( @@ -352,6 +359,7 @@ class InputPreprocessor: prompt_token_ids = self._tokenize_prompt( prompt_text, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) return token_inputs( @@ -364,6 +372,7 @@ class InputPreprocessor: async def 
_prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: @@ -375,6 +384,7 @@ class InputPreprocessor: prompt_token_ids = await self._tokenize_prompt_async( prompt_text, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) return token_inputs( @@ -517,6 +527,7 @@ class InputPreprocessor: def _process_encoder_decoder_prompt( self, prompt: PromptType, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -553,7 +564,9 @@ class InputPreprocessor: if is_explicit_encoder_decoder_prompt(prompt): encoder_inputs = self._prompt_to_llm_inputs( - prompt["encoder_prompt"]) + prompt["encoder_prompt"], + tokenization_kwargs=tokenization_kwargs, + ) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None else: @@ -565,7 +578,10 @@ class InputPreprocessor: self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: - inputs = self._prompt_to_llm_inputs(prompt) + inputs = self._prompt_to_llm_inputs( + prompt, + tokenization_kwargs=tokenization_kwargs, + ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( @@ -581,6 +597,7 @@ class InputPreprocessor: async def _process_encoder_decoder_prompt_async( self, prompt: PromptType, + tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" encoder_inputs: SingletonInputs @@ -588,13 +605,18 @@ class InputPreprocessor: if is_explicit_encoder_decoder_prompt(prompt): encoder_task = self._prompt_to_llm_inputs_async( - prompt["encoder_prompt"]) + prompt["encoder_prompt"], + tokenization_kwargs=tokenization_kwargs, + ) if (decoder_input := prompt["decoder_prompt"]) is None: encoder_inputs = await 
encoder_task decoder_inputs = None else: - decoder_task = self._prompt_to_llm_inputs_async(decoder_input) + decoder_task = self._prompt_to_llm_inputs_async( + decoder_input, + tokenization_kwargs=tokenization_kwargs, + ) encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) @@ -606,7 +628,10 @@ class InputPreprocessor: self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: - inputs = await self._prompt_to_llm_inputs_async(prompt) + inputs = await self._prompt_to_llm_inputs_async( + prompt, + tokenization_kwargs=tokenization_kwargs, + ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( @@ -638,6 +663,7 @@ class InputPreprocessor: def _process_decoder_only_prompt( self, prompt: SingletonPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -660,6 +686,7 @@ class InputPreprocessor: prompt_comps = self._prompt_to_llm_inputs( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) @@ -672,6 +699,7 @@ class InputPreprocessor: async def _process_decoder_only_prompt_async( self, prompt: SingletonPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -679,6 +707,7 @@ class InputPreprocessor: """Async version of :meth:`_process_decoder_only_prompt`.""" prompt_comps = await self._prompt_to_llm_inputs_async( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) @@ -691,6 +720,7 @@ class InputPreprocessor: def preprocess( self, prompt: PromptType, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: 
Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -711,6 +741,7 @@ class InputPreprocessor: # Decoder-only operation return self._process_decoder_only_prompt( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, @@ -719,6 +750,7 @@ class InputPreprocessor: async def preprocess_async( self, prompt: PromptType, + tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -739,6 +771,7 @@ class InputPreprocessor: # Decoder-only operation return await self._process_decoder_only_prompt_async( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 511571d05..3ac5c5c3d 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -186,9 +186,10 @@ class SamplingParams( logits_processors: list of functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument. - truncate_prompt_tokens: If set to an integer k, will use only the last k - tokens from the prompt (i.e., left truncation). Defaults to None - (i.e., no truncation). + truncate_prompt_tokens: If set to -1, will use the truncation size + supported by the model. If set to an integer k, will use only + the last k tokens from the prompt (i.e., left truncation). + Defaults to None (i.e., no truncation). guided_decoding: If provided, the engine will construct a guided decoding logits processor from these parameters. Defaults to None. 
logit_bias: If provided, the engine will construct a logits processor diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index da5bec856..57b9242b8 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -55,6 +55,8 @@ def encode_tokens( tokenizer: AnyTokenizer, text: str, *, + truncation: Optional[bool] = None, + max_length: Optional[int] = None, add_special_tokens: Optional[bool] = None, ) -> list[int]: """ @@ -64,10 +66,18 @@ def encode_tokens( :code:`add_special_tokens=None` means to use the backend's default settings. """ + + kw_args: dict[str, Any] = {} + if max_length is not None: + kw_args["max_length"] = max_length + + if truncation is not None: + kw_args["truncation"] = truncation + if add_special_tokens is not None: - return tokenizer.encode(text, add_special_tokens=add_special_tokens) + kw_args["add_special_tokens"] = add_special_tokens - return tokenizer.encode(text) + return tokenizer.encode(text, **kw_args) def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index b4eb081c9..d69e5a6b4 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -94,6 +94,8 @@ class TokenizerBase(ABC): @abstractmethod def encode(self, text: str, + truncation: Optional[bool] = None, + max_length: Optional[int] = None, add_special_tokens: Optional[bool] = None) -> list[int]: raise NotImplementedError() diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py index a829985cb..aff2d2eb1 100644 --- a/vllm/transformers_utils/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -45,11 +45,16 @@ class TokenizerGroup: def encode(self, prompt: str, + max_length: Optional[int] = None, + truncation: Optional[bool] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: 
Optional[bool] = None) -> List[int]: + tokenizer = self.get_lora_tokenizer(lora_request) ret = encode_tokens(tokenizer, prompt, + max_length=max_length, + truncation=truncation, add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret @@ -57,11 +62,15 @@ class TokenizerGroup: async def encode_async( self, prompt: str, + max_length: Optional[int] = None, + truncation: Optional[bool] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) ret = encode_tokens(tokenizer, prompt, + max_length=max_length, + truncation=truncation, add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 296149a45..6d4655781 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -359,6 +359,8 @@ class MistralTokenizer(TokenizerBase): def encode(self, text: str, + truncation: Optional[bool] = None, + max_length: Optional[int] = None, add_special_tokens: Optional[bool] = None) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1334fb789..2562fcc9c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import asyncio from collections.abc import AsyncGenerator, Mapping from copy import copy -from typing import Optional, Union +from typing import Any, Optional, Union import numpy as np @@ -201,6 +201,7 @@ class AsyncLLM(EngineClient): params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -219,7 +220,8 @@ class AsyncLLM(EngineClient): # Convert Input --> Request. prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + tokenization_kwargs, trace_headers, prompt_adapter_request, + priority) if params.n == 1: await self._add_request(request, prompt_str, None, 0, queue) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 85da58451..b471b1536 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -175,6 +175,7 @@ class LLMEngine: params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -182,7 +183,8 @@ class LLMEngine: # Process raw inputs into the request. 
prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + tokenization_kwargs, trace_headers, prompt_adapter_request, + priority) n = params.n if isinstance(params, SamplingParams) else 1 diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8ae5d0157..b98a31773 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -2,7 +2,7 @@ import time from collections.abc import Mapping, Sequence -from typing import Literal, Optional, Union +from typing import Any, Literal, Optional, Union from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs @@ -198,6 +198,7 @@ class Processor: params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, + tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, @@ -224,6 +225,7 @@ class Processor: # 3. Apply prompt adapter to prompt token ids if one exists. 
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, + tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=self.use_hash, -- GitLab From 2c4f59afc3d50fda805c4ad94c9d9be168cded0b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 29 Apr 2025 19:08:04 -0700 Subject: [PATCH 045/461] Update PyTorch to 2.7.0 (#16859) --- .buildkite/release-pipeline.yaml | 10 ++-- .buildkite/scripts/upload-wheels.sh | 18 ++++---- .buildkite/test-pipeline.yaml | 2 +- .github/workflows/lint-and-deploy.yaml | 4 +- .pre-commit-config.yaml | 2 +- CMakeLists.txt | 4 +- docker/Dockerfile | 46 +++++++++++++------ .../installation/gpu/cuda.inc.md | 6 +-- .../online_serving/chart-helm/values.yaml | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cpu.txt | 11 +++-- requirements/cuda.txt | 9 ++-- requirements/rocm-build.txt | 6 +-- requirements/test.in | 6 +-- requirements/test.txt | 44 ++++++++++-------- setup.py | 2 +- vllm/attention/ops/ipex_attn.py | 3 +- 18 files changed, 102 insertions(+), 77 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 642c0259c..03e2267a1 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,20 +1,20 @@ steps: - - label: "Build wheel - CUDA 12.4" + - label: "Build wheel - CUDA 12.8" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.1" + - label: "Build wheel - CUDA 12.6" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -48,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index a681f8927..75e3ef264 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi -aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file +aws s3 cp 
"$wheel" "s3://vllm-wheels/$version/" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc0eb3d9f..8da43322c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -455,7 +455,7 @@ steps: - tests/models/encoder_decoder/language commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install causal-conv1d + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 7b1d9f699..dd9b61a64 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -66,7 +66,7 @@ jobs: export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - + - name: curl test run: | kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & @@ -79,4 +79,4 @@ jobs: 
"max_tokens": 7, "temperature": 0 }'):$CODE" - echo "$CODE" \ No newline at end of file + echo "$CODE" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 87681d7eb..90ed492d9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,7 +46,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3314f05fd..6be9adcb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") # # Try to find python package with an executable that exactly matches diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d1fac9db..17adb7a92 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -5,11 +5,11 @@ # docs/source/contributing/dockerfile/dockerfile.md and # docs/source/assets/contributing/dockerfile-stages-dependency.png -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV 
UV_INDEX_STRATEGY="unsafe-best-match" # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -69,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/cuda.txt + uv pip install --system -r requirements/cuda.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -92,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') COPY . . 
ARG GIT_REPO_CHECK=0 @@ -161,22 +165,25 @@ FROM base as dev # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt -# Workaround for #17068 -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system mamba-ssm==2.2.4 --no-build-isolation RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/dev.txt + uv pip install --system -r requirements/dev.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed # TODO: Restore to base image after FlashInfer AOT wheel fixed FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive @@ -209,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -229,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install vllm wheel first, so that torch etc will be installed. 
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose + uv pip install --system dist/*.whl --verbose \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # If we need to build FlashInfer wheel before its release: # $ export FLASHINFER_ENABLE_AOT=1 @@ -246,19 +255,26 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ + # TESTING: install FlashInfer from source to test 2.7.0 final RC + FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \ + uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \ fi COPY examples examples COPY benchmarks benchmarks COPY ./vllm/collect_env.py . +RUN --mount=type=cache,target=/root/.cache/uv \ +. /etc/environment && \ +uv pip list + # Although we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. # TODO: Remove this once FlashInfer AOT wheel is fixed COPY requirements/build.txt requirements/build.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### vLLM installation IMAGE #################### @@ -272,11 +288,13 @@ ADD . 
/vllm-workspace/ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" -# install development dependencies (for testing) # Workaround for #17068 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system mamba-ssm==2.2.4 --no-build-isolation + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" + +# install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/dev.txt diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 46bdb08eb..06915f09d 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.6) binaries. ## Requirements @@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.4. +# Install vLLM with CUDA 12.6. pip install vllm # If you are using pip. uv pip install vllm # If you are using uv. ``` -As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions: ```console # Install vLLM with CUDA 11.8. 
diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9c48e7d06..28dba9a6f 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 diff --git a/pyproject.toml b/pyproject.toml index b5f1039b4..c85e85b0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.6.0", + "torch == 2.7.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 13d643bca..19d757b45 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.6.0 +torch==2.7.0 wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 69f732c24..752931158 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -2,18 +2,19 @@ -r common.txt # Dependencies for CPUs -torch==2.6.0+cpu; platform_machine == "x86_64" -torch==2.6.0; platform_system == "Darwin" -torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.7.0+cpu; platform_machine == "x86_64" +torch==2.7.0; platform_system == "Darwin" +torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0.dev20250304; platform_machine == "s390x" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != 
"s390x" -torchaudio==2.6.0; platform_machine == "ppc64le" +torchaudio==2.7.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.21.0; platform_machine == "ppc64le" +torchvision==0.22.0; platform_machine == "ppc64le" datasets # for benchmark scripts # cpu cannot use triton 3.3.0 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index cdc6ee75a..a71d9728f 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.6.0 -torchaudio==2.6.0 +torch==2.7.0 +torchaudio==2.7.0 # These must be updated alongside torch -torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 +torchvision==0.22.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 +xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 05de4ff16..55ffe82e8 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -2,9 +2,9 @@ -r common.txt --extra-index-url https://download.pytorch.org/whl/rocm6.2.4 -torch==2.6.0 -torchvision==0.21.0 -torchaudio==2.6.0 +torch==2.7.0 +torchvision==0.22.0 +torchaudio==2.7.0 triton==3.2 cmake>=3.26,<4 diff --git a/requirements/test.in b/requirements/test.in index c5d2c4cd4..ee79aae58 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -23,9 +23,9 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test -torch==2.6.0 -torchaudio==2.6.0 -torchvision==0.21.0 +torch==2.7.0 +torchaudio==2.7.0 +torchvision==0.22.0 transformers_stream_generator # required for qwen-vl test mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test diff --git a/requirements/test.txt b/requirements/test.txt index 9642a5bfe..2e8121e38 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match absl-py==2.1.0 # via rouge-score accelerate==1.0.1 @@ -274,7 +274,7 @@ mamba-ssm==2.2.4 # via -r requirements/test.in markdown-it-py==3.0.0 # via rich -markupsafe==3.0.2 +markupsafe==3.0.1 # via # jinja2 # werkzeug @@ -355,39 +355,42 @@ numpy==1.26.4 # transformers # tritonclient # vocos -nvidia-cublas-cu12==12.4.5.8 +nvidia-cublas-cu12==12.6.4.1 # via # nvidia-cudnn-cu12 # 
nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-cupti-cu12==12.6.80 # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.6.77 # via torch -nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.6.77 # via torch -nvidia-cudnn-cu12==9.1.0.70 +nvidia-cudnn-cu12==9.5.1.17 # via torch -nvidia-cufft-cu12==11.2.1.3 +nvidia-cufft-cu12==11.3.0.4 # via torch -nvidia-curand-cu12==10.3.5.147 +nvidia-cufile-cu12==1.11.1.6 # via torch -nvidia-cusolver-cu12==11.6.1.9 +nvidia-curand-cu12==10.3.7.77 # via torch -nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.2 +nvidia-cusparselt-cu12==0.6.3 # via torch -nvidia-nccl-cu12==2.21.5 +nvidia-nccl-cu12==2.26.2 # via torch -nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvjitlink-cu12==12.6.85 # via + # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.4.127 +nvidia-nvtx-cu12==12.6.77 # via torch opencv-python-headless==4.11.0.86 # via @@ -634,6 +637,7 @@ setuptools==75.8.0 # mamba-ssm # pytablewriter # torch + # triton shellingham==1.5.4 # via typer six==1.16.0 @@ -664,7 +668,7 @@ starlette-testclient==0.4.1 # via schemathesis statsmodels==0.14.4 # via genai-perf -sympy==1.13.1 +sympy==1.13.3 # via # einx # torch @@ -696,7 +700,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.6.0 +torch==2.7.0 # via # -r requirements/test.in # accelerate @@ -714,12 +718,12 @@ torch==2.6.0 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.6.0 +torchaudio==2.7.0 # via # -r requirements/test.in # encodec # vocos -torchvision==0.21.0 +torchvision==0.22.0 # via # -r requirements/test.in # timm @@ -748,7 +752,7 @@ transformers==4.51.3 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.2.0 +triton==3.3.0 # via torch tritonclient==2.51.0 # 
via diff --git a/setup.py b/setup.py index a1867960e..7675fbdf3 100755 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None # fallback to cpu VLLM_TARGET_DEVICE = "cpu" -MAIN_CUDA_VERSION = "12.4" +MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 6d96f5832..1702203b1 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -5,7 +5,8 @@ from typing import Dict, List, Optional, Tuple try: import intel_extension_for_pytorch.llm.modules as ipex_modules _use_ipex = True -except ImportError: +# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 +except (ImportError, AttributeError): _use_ipex = False import torch -- GitLab From 13698db634b492151c6a5e04769119e3af29b265 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Apr 2025 03:38:22 +0100 Subject: [PATCH 046/461] Improve configs - `ModelConfig` (#17130) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/conftest.py | 2 +- tests/engine/test_arg_utils.py | 13 +- .../test_register_quantization_config.py | 4 +- tests/test_config.py | 9 +- vllm/config.py | 513 +++++++++--------- vllm/engine/arg_utils.py | 451 +++++---------- vllm/entrypoints/llm.py | 18 +- .../layers/quantization/aqlm.py | 3 +- .../model_executor/layers/quantization/awq.py | 3 +- .../layers/quantization/awq_marlin.py | 7 +- .../layers/quantization/base_config.py | 13 +- .../layers/quantization/bitblas.py | 7 +- .../layers/quantization/bitsandbytes.py | 3 +- .../compressed_tensors/compressed_tensors.py | 3 +- .../layers/quantization/deepspeedfp.py | 5 +- .../layers/quantization/experts_int8.py | 3 +- .../layers/quantization/fbgemm_fp8.py | 3 +- .../model_executor/layers/quantization/fp8.py | 3 +- .../layers/quantization/gguf.py | 3 +- 
.../layers/quantization/gptq.py | 3 +- .../layers/quantization/gptq_bitblas.py | 7 +- .../layers/quantization/gptq_marlin.py | 7 +- .../layers/quantization/gptq_marlin_24.py | 7 +- .../layers/quantization/hqq_marlin.py | 3 +- .../layers/quantization/ipex_quant.py | 7 +- .../layers/quantization/marlin.py | 7 +- .../layers/quantization/modelopt.py | 7 +- .../layers/quantization/moe_wna16.py | 7 +- .../layers/quantization/neuron_quant.py | 3 +- .../layers/quantization/ptpc_fp8.py | 3 +- .../model_executor/layers/quantization/qqq.py | 3 +- .../layers/quantization/quark/quark.py | 3 +- .../layers/quantization/torchao.py | 3 +- .../layers/quantization/tpu_int8.py | 3 +- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/model_loader/neuron.py | 1 - 36 files changed, 492 insertions(+), 650 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5fc09b241..f02b5a8c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -738,7 +738,7 @@ class VllmRunner: - `block_size`: Set to `16` instead of `None` to reduce memory usage. - `enable_chunked_prefill`: Set to `False` instead of `None` for test reproducibility. - - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph. + - `enforce_eager`: Set to `False` to test CUDA graph. 
""" def __init__( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 2c8665802..16721ee9c 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -8,7 +8,7 @@ from typing import Literal, Optional import pytest -from vllm.config import PoolerConfig, config +from vllm.config import config from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, get_type, is_not_builtin, is_type, literal_to_kwargs, nullable_kvs, @@ -222,17 +222,6 @@ def test_prefix_cache_default(): assert not engine_args.enable_prefix_caching -def test_valid_pooling_config(): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - args = parser.parse_args([ - '--override-pooler-config', - '{"pooling_type": "MEAN"}', - ]) - engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.override_pooler_config == PoolerConfig( - pooling_type="MEAN", ) - - @pytest.mark.parametrize( ("arg"), [ diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index abc1c05de..0ea71aaf8 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -14,7 +14,7 @@ import torch.nn.functional as F from vllm.model_executor.layers.linear import LinearBase # noqa: E501 from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization import ( - get_quantization_config, register_quantization_config) + QuantizationMethods, get_quantization_config, register_quantization_config) from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig) @@ -54,7 +54,7 @@ class CustomQuantConfig(QuantizationConfig): """Initialize the quantization config.""" self.num_bits = num_bits - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: """Name of the quantization method.""" return "custom_quant" diff --git 
a/tests/test_config.py b/tests/test_config.py index 2e5da8128..f2155d954 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -185,7 +185,7 @@ def test_get_pooling_config(): revision=None, ) - pooling_config = model_config._init_pooler_config(None) + pooling_config = model_config._init_pooler_config() assert pooling_config is not None assert pooling_config.normalize @@ -205,11 +205,12 @@ def test_get_pooling_config_from_args(): dtype="float16", revision=None) - override_config = PoolerConfig(pooling_type='CLS', normalize=True) + override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True) + model_config.override_pooler_config = override_pooler_config - pooling_config = model_config._init_pooler_config(override_config) + pooling_config = model_config._init_pooler_config() assert pooling_config is not None - assert asdict(pooling_config) == asdict(override_config) + assert asdict(pooling_config) == asdict(override_pooler_config) @pytest.mark.skipif(current_platform.is_rocm(), diff --git a/vllm/config.py b/vllm/config.py index abe59734e..f9c5e25a4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -16,9 +16,8 @@ from dataclasses import (MISSING, dataclass, field, fields, is_dataclass, replace) from importlib.util import find_spec from pathlib import Path -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, - Optional, Protocol, TypeVar, Union, cast, get_args, - get_origin) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, + Protocol, TypeVar, Union, cast, get_args, get_origin) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -211,103 +210,190 @@ def get_field(cls: ConfigType, name: str) -> Field: f"{cls.__name__}.{name} must have a default value or default factory.") -class ModelConfig: - """Configuration for the model. 
+TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] - Args: - model: Name or path of the huggingface model to use. - It is also used as the content for `model_name` tag in metrics - output when `served_model_name` is not specified. - task: The task to use the model for. Each vLLM instance only supports - one task, even if the same model can be used for multiple tasks. - When the model only supports one task, "auto" can be used to select - it; otherwise, you must specify explicitly which task to use. - tokenizer: Name or path of the huggingface tokenizer to use. - tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if - available, "slow" will always use the slow tokenizer, - "mistral" will always use the tokenizer from `mistral_common`, and - "custom" will use --tokenizer to select the preregistered tokenizer. - trust_remote_code: Trust remote code (e.g., from HuggingFace) when - downloading the model and tokenizer. - allowed_local_media_path: Allowing API requests to read local images or - videos from directories specified by the server file system. - This is a security risk. Should only be enabled in trusted - environments. - dtype: Data type for model weights and activations. The "auto" option - will use FP16 precision for FP32 and FP16 models, and BF16 precision - for BF16 models. - seed: Random seed for reproducibility. - revision: The specific model version to use. It can be a branch name, - a tag name, or a commit id. If unspecified, will use the default - version. - code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a - commit id. If unspecified, will use the default version. - tokenizer_revision: The specific tokenizer version to use. It can be a - branch name, a tag name, or a commit id. If unspecified, will use - the default version. 
- max_model_len: Maximum length of a sequence (including prompt and - output). If None, will be derived from the model. - spec_target_max_model_len: Specify the the maximum length for spec - decoding draft models. - quantization: Quantization method that was used to quantize the model - weights. If None, we assume the model weights are not quantized. - enforce_eager: Whether to enforce eager execution. If True, we will - disable CUDA graph and always execute the model in eager mode. - If False, we will use CUDA graph and eager execution in hybrid. - If None, the user did not specify, so default to False. - max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. - When a sequence has context length larger than this, we fall back - to eager mode. Additionally for encoder-decoder models, if the - sequence length of the encoder input is larger than this, we fall - back to the eager mode. - max_logprobs: Maximum number of log probabilities. Defaults to 20. - disable_sliding_window: Whether to disable sliding window. If True, - we will disable the sliding window functionality of the model. - If the model does not support sliding window, this argument is - ignored. - skip_tokenizer_init: If true, skip initialization of tokenizer and - detokenizer. - served_model_name: The model name used in metrics tag `model_name`, - matches the model name exposed via the APIs. If multiple model - names provided, the first name will be used. If not specified, - the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data items per modality - per prompt. Only applicable for multimodal models. - mm_processor_kwargs: Overrides for the multi-modal processor obtained - from `AutoProcessor.from_pretrained`. - disable_mm_preprocessor_cache: If True, disable caching of the - processed multi-modal inputs. - use_async_output_proc: Whether to use async output processor. - Defaults to True. - config_format: The config format which shall be loaded. 
- Defaults to 'auto' which defaults to 'hf'. - hf_token: The token to use as HTTP bearer authorization for remote files - . If `True`, will use the token generated when running - `huggingface-cli login` (stored in `~/.huggingface`). - hf_overrides: If a dictionary, contains arguments to be forwarded to the - HuggingFace config. If a callable, it is called to update the - HuggingFace config. - override_neuron_config: Initialize non default neuron config or - override default neuron config that are specific to Neuron devices, - this argument will be used to configure the neuron config that - can not be gathered from the vllm arguments. - override_pooler_config: Initialize non default pooling config or - override default pooling config for the pooling model. - logits_processor_pattern: Optional regex pattern specifying valid - logits processor qualified names that can be passed with the - `logits_processors` extra completion argument. Defaults to None, - which allows no processors. - generation_config: Configuration parameter file for generation. - model_impl: Which implementation of the model to use: - "auto" will try to use the vLLM implementation if it exists and - fall back to the Transformers implementation if no vLLM - implementation is available. - "vllm" will use the vLLM model implementation. - "transformers" will use the Transformers model implementation. - override_generation_config: Override the generation config with the - given config. - """ + +@config +@dataclass +class ModelConfig: + """Configuration for the model.""" + + model: str = "facebook/opt-125m" + """Name or path of the Hugging Face model to use. It is also used as the + content for `model_name` tag in metrics output when `served_model_name` is + not specified.""" + task: Literal[TaskOption, Literal["draft"]] = "auto" + """The task to use the model for. Each vLLM instance only supports one + task, even if the same model can be used for multiple tasks. 
When the model + only supports one task, "auto" can be used to select it; otherwise, you + must specify explicitly which task to use.""" + tokenizer: str = None # type: ignore + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode = "auto" + """Tokenizer mode:\n + - "auto" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "custom" will use --tokenizer to select the preregistered tokenizer.""" + trust_remote_code: bool = False + """Trust remote code (e.g., from HuggingFace) when downloading the model + and tokenizer.""" + dtype: Union[ModelDType, torch.dtype] = "auto" + """Data type for model weights and activations:\n + - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 + precision for BF16 models.\n + - "half" for FP16. Recommended for AWQ quantization.\n + - "float16" is the same as "half".\n + - "bfloat16" for a balance between precision and range.\n + - "float" is shorthand for FP32 precision.\n + - "float32" for FP32 precision.""" + seed: Optional[int] = None + """Random seed for reproducibility.""" + hf_config_path: Optional[str] = None + """Name or path of the Hugging Face config to use. If unspecified, model + name or path will be used.""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + revision: Optional[str] = None + """The specific model version to use. It can be a branch name, a tag name, + or a commit id. If unspecified, will use the default version.""" + code_revision: Optional[str] = None + """The specific revision to use for the model code on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. 
If unspecified, will + use the default version.""" + rope_scaling: dict[str, Any] = field(default_factory=dict) + """RoPE scaling configuration in JSON format. For example, + `{"rope_type":"dynamic","factor":2.0}`.""" + rope_theta: Optional[float] = None + """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE + theta improves the performance of the scaled model.""" + tokenizer_revision: Optional[str] = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + max_model_len: int = None # type: ignore + """Model context length (prompt and output). If unspecified, will be + automatically derived from the model config. + + When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable + format. Examples:\n + - 1k -> 1000\n + - 1K -> 1024\n + - 25.6k -> 25,600""" + spec_target_max_model_len: Optional[int] = None + """Specify the maximum length for spec decoding draft models.""" + quantization: Optional[QuantizationMethods] = None + """Method used to quantize the weights. If `None`, we first check the + `quantization_config` attribute in the model config file. If that is + `None`, we assume the model weights are not quantized and use `dtype` to + determine the data type of the weights.""" + enforce_eager: bool = False + """Whether to always use eager-mode PyTorch. If True, we will disable CUDA + graph and always execute the model in eager mode. If False, we will use + CUDA graph and eager execution in hybrid for maximal performance and + flexibility.""" + max_seq_len_to_capture: int = 8192 + """Maximum sequence len covered by CUDA graphs. When a sequence has context + length larger than this, we fall back to eager mode.
Additionally for + encoder-decoder models, if the sequence length of the encoder input is + larger than this, we fall back to the eager mode.""" + max_logprobs: int = 20 + """Maximum number of log probabilities to return when `logprobs` is + specified in `SamplingParams`. The default value comes from the default for + the OpenAI Chat Completions API.""" + disable_sliding_window: bool = False + """Whether to disable sliding window. If True, we will disable the sliding + window functionality of the model, capping to sliding window size. If the + model does not support sliding window, this argument is ignored.""" + disable_cascade_attn: bool = False + """Disable cascade attention for V1. While cascade attention does not + change the mathematical correctness, disabling it could be useful for + preventing potential numerical issues. Note that even if this is set to + False, cascade attention will be only used when the heuristic tells that + it's beneficial.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + served_model_name: Optional[Union[str, list[str]]] = None + """The model name(s) used in the API. If multiple names are provided, the + server will respond to any of the provided names. The model name in the + model field of a response will be the first name in this list. If not + specified, the model name will be the same as the `--model` argument. Note + that these name(s) will also be used in `model_name` tag content of + prometheus metrics; if multiple names are provided, the metrics tag will + take the first one.""" + limit_mm_per_prompt: dict[str, int] = field(default_factory=dict) + """Maximum number of data items per modality per prompt.
Only applicable + for multimodal models.""" + use_async_output_proc: bool = True + """Whether to use async output processor.""" + config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value + """The format of the model config to load:\n + - "auto" will try to load the config in hf format if available else it + will try to load in mistral format.\n + - "hf" will load the config in hf format.\n + - "mistral" will load the config in mistral format.""" + hf_token: Optional[Union[bool, str]] = None + """The token to use as HTTP bearer authorization for remote files. If + `True`, will use the token generated when running `huggingface-cli login` + (stored in `~/.huggingface`).""" + hf_overrides: HfOverrides = field(default_factory=dict) + """If a dictionary, contains arguments to be forwarded to the Hugging Face + config. If a callable, it is called to update the HuggingFace config. When + specified via CLI, the argument must be a valid JSON string.""" + mm_processor_kwargs: Optional[dict[str, Any]] = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `AutoProcessor.from_pretrained`. The available overrides depend on the + model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. + When specified via CLI, the argument must be a valid JSON string.""" + disable_mm_preprocessor_cache: bool = False + """If `True`, disable caching of the multi-modal preprocessor/mapper (not + recommended).""" + override_neuron_config: dict[str, Any] = field(default_factory=dict) + """Initialize non-default neuron config or override default neuron config + that are specific to Neuron devices, this argument will be used to + configure the neuron config that can not be gathered from the vllm + arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`.
When specified via CLI, + the argument must be a valid JSON string.""" + pooler_config: Optional["PoolerConfig"] = field(init=False) + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None + """Initialize non-default pooling config or override default pooling config + for the pooling model. e.g. `{"pooling_type": "mean", "normalize": false}`. + When specified via CLI, the argument must be a valid JSON string.""" + logits_processor_pattern: Optional[str] = None + """Optional regex pattern specifying valid logits processor qualified names + that can be passed with the `logits_processors` extra completion argument. + Defaults to `None`, which allows no processors.""" + generation_config: str = "auto" + """The folder path to the generation config. Defaults to `"auto"`, the + generation config will be loaded from model path. If set to `"vllm"`, no + generation config is loaded, vLLM defaults will be used. If set to a folder + path, the generation config will be loaded from the specified folder path. + If `max_new_tokens` is specified in generation config, then it sets a + server-wide limit on the number of output tokens for all requests.""" + override_generation_config: dict[str, Any] = field(default_factory=dict) + """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If + used with `--generation-config auto`, the override parameters will be + merged with the default config from the model. If used with + `--generation-config vllm`, only the override parameters are used. 
+ When specified via CLI, the argument must be a valid JSON string.""" + enable_sleep_mode: bool = False + """Enable sleep mode for the engine (only cuda platform is supported).""" + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO.value + """Which implementation of the model to use:\n + - "auto" will try to use the vLLM implementation, if it exists, and fall + back to the Transformers implementation if no vLLM implementation is + available.\n + - "vllm" will use the vLLM model implementation.\n + - "transformers" will use the Transformers model implementation.""" def compute_hash(self) -> str: """ @@ -342,92 +428,43 @@ class ModelConfig: assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() - def __init__( - self, - model: str, - task: Literal[TaskOption, Literal["draft"]], - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - hf_config_path: Optional[str] = None, - allowed_local_media_path: str = "", - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[dict[str, Any]] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - disable_cascade_attn: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, list[str]]] = None, - limit_mm_per_prompt: Optional[dict[str, int]] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - disable_mm_preprocessor_cache: bool = False, - use_async_output_proc: bool = True, - config_format: ConfigFormat = ConfigFormat.AUTO, - hf_token: Optional[Union[bool, str]] = None, - hf_overrides: Optional[HfOverrides] = None, - 
override_neuron_config: Optional[dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, - logits_processor_pattern: Optional[str] = None, - generation_config: str = "auto", - enable_sleep_mode: bool = False, - override_generation_config: Optional[dict[str, Any]] = None, - model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, - ) -> None: - self.model = maybe_model_redirect(model) - self.tokenizer = maybe_model_redirect(tokenizer) - - self.hf_config_path = hf_config_path - if isinstance(hf_config_path, str): - self.hf_config_path = maybe_model_redirect(hf_config_path) - - self.tokenizer_mode = tokenizer_mode - self.trust_remote_code = trust_remote_code - self.allowed_local_media_path = allowed_local_media_path - self.seed = seed - self.revision = revision - self.code_revision = code_revision - self.rope_scaling = rope_scaling - self.rope_theta = rope_theta - self.model_impl = model_impl - - if hf_overrides is None: - hf_overrides = {} - - if callable(hf_overrides): + def __post_init__(self) -> None: + self.model = maybe_model_redirect(self.model) + # The tokenizer is consistent with the model by default. 
+ if self.tokenizer is None: + self.tokenizer = self.model + if self.tokenizer_revision is None: + self.tokenizer_revision = self.revision + self.tokenizer = maybe_model_redirect(self.tokenizer) + + if isinstance(self.hf_config_path, str): + self.hf_config_path = maybe_model_redirect(self.hf_config_path) + + if callable(self.hf_overrides): hf_overrides_kw = {} - hf_overrides_fn = hf_overrides + hf_overrides_fn = self.hf_overrides else: - hf_overrides_kw = hf_overrides + hf_overrides_kw = self.hf_overrides hf_overrides_fn = None - if rope_scaling is not None: - hf_override: dict[str, Any] = {"rope_scaling": rope_scaling} + if self.rope_scaling: + hf_override: dict[str, Any] = {"rope_scaling": self.rope_scaling} hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides) + hf_overrides_str = json.dumps(hf_overrides_kw) msg = ( "`--rope-scaling` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) - if rope_theta is not None: - hf_override = {"rope_theta": rope_theta} + if self.rope_theta is not None: + hf_override = {"rope_theta": self.rope_theta} hf_overrides_kw.update(hf_override) - hf_overrides_str = json.dumps(hf_overrides) + hf_overrides_str = json.dumps(hf_overrides_kw) msg = ( "`--rope-theta` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_overrides_str}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) - self.maybe_pull_model_tokenizer_for_s3(model, tokenizer) + self.maybe_pull_model_tokenizer_for_s3(self.model, self.tokenizer) if (backend := envs.VLLM_ATTENTION_BACKEND ) and backend == "FLASHINFER" and find_spec("flashinfer") is None: @@ -437,20 +474,6 @@ class ModelConfig: "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 "for instructions on how to install it.") - # The tokenizer version is consistent with the model version by default. 
- if tokenizer_revision is None: - self.tokenizer_revision = revision - else: - self.tokenizer_revision = tokenizer_revision - self.quantization = quantization - self.enforce_eager = enforce_eager - self.max_seq_len_to_capture = max_seq_len_to_capture - self.max_logprobs = max_logprobs - self.disable_sliding_window = disable_sliding_window - self.disable_cascade_attn = disable_cascade_attn - self.skip_tokenizer_init = skip_tokenizer_init - self.enable_sleep_mode = enable_sleep_mode - from vllm.platforms import current_platform if (self.enable_sleep_mode @@ -458,9 +481,12 @@ class ModelConfig: raise ValueError( "Sleep mode is not supported on current platform.") + if isinstance(self.config_format, str): + self.config_format = ConfigFormat(self.config_format) + hf_config = get_config(self.hf_config_path or self.model, - trust_remote_code, revision, code_revision, - config_format) + self.trust_remote_code, self.revision, + self.code_revision, self.config_format) if hf_overrides_kw: logger.info("Overriding HF config with %s", hf_overrides_kw) @@ -476,13 +502,8 @@ class ModelConfig: "attention_chunk_size", None) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( - self.model, hf_token=hf_token, revision=revision) - self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.use_async_output_proc = use_async_output_proc - - # Set enforce_eager to False if the value is unset. 
- if self.enforce_eager is None: - self.enforce_eager = False + self.model, hf_token=self.hf_token, revision=self.revision) + self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype) interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"] sliding_window = getattr(self.hf_text_config, "sliding_window", None) @@ -515,18 +536,14 @@ class ModelConfig: self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, - max_model_len=max_model_len, + max_model_len=self.max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window_len=self.get_hf_config_sliding_window(), - spec_target_max_model_len=spec_target_max_model_len, + spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config) - self.served_model_name = get_served_model_name(model, - served_model_name) - self.multimodal_config = self._init_multimodal_config( - limit_mm_per_prompt=limit_mm_per_prompt, - mm_processor_kwargs=mm_processor_kwargs, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, - ) + self.served_model_name = get_served_model_name(self.model, + self.served_model_name) + self.multimodal_config = self._init_multimodal_config() if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -535,24 +552,19 @@ class ModelConfig: self.has_noops = self._init_has_noops() self.has_inner_state = self._init_has_inner_state() - if current_platform.is_neuron(): - self.override_neuron_config = override_neuron_config - else: - self.override_neuron_config = None + if (not current_platform.is_neuron() and self.override_neuron_config): + raise ValueError( + "`override_neuron_config` is only supported on Neuron.") - supported_tasks, task = self._resolve_task(task) + supported_tasks, task = self._resolve_task(self.task) self.supported_tasks = supported_tasks - self.task: Final = task + self.task = task if self.task in ("draft", "generate"): self.truncation_side = "left" else: self.truncation_side = "right" - self.pooler_config 
= self._init_pooler_config(override_pooler_config) - self.logits_processor_pattern = logits_processor_pattern - - self.generation_config = generation_config - self.override_generation_config = override_generation_config or {} + self.pooler_config = self._init_pooler_config() self._verify_quantization() self._verify_cuda_graph() @@ -591,26 +603,21 @@ class ModelConfig: model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) self.tokenizer = s3_tokenizer.dir - def _init_multimodal_config( - self, - limit_mm_per_prompt: Optional[dict[str, int]], - mm_processor_kwargs: Optional[dict[str, Any]], - disable_mm_preprocessor_cache: bool, - ) -> Optional["MultiModalConfig"]: + def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: if self.registry.is_multimodal_model(self.architectures): return MultiModalConfig( - limit_per_prompt=limit_mm_per_prompt or {}, - mm_processor_kwargs=mm_processor_kwargs or {}, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, - ) + limit_per_prompt=self.limit_mm_per_prompt, + mm_processor_kwargs=self.mm_processor_kwargs, + disable_mm_preprocessor_cache=self. 
+ disable_mm_preprocessor_cache) - if limit_mm_per_prompt: + if self.limit_mm_per_prompt: raise ValueError("`limit_mm_per_prompt` is only supported for " "multimodal models.") - if mm_processor_kwargs: + if self.mm_processor_kwargs: raise ValueError("`mm_processor_kwargs` is only supported for " "multimodal models.") - if disable_mm_preprocessor_cache: + if self.disable_mm_preprocessor_cache: raise ValueError("`disable_mm_preprocessor_cache` is only " "supported for multimodal models.") @@ -620,31 +627,32 @@ class ModelConfig: return get_sentence_transformer_tokenizer_config( self.model, self.revision) - def _init_pooler_config( - self, - override_pooler_config: Optional["PoolerConfig"], - ) -> Optional["PoolerConfig"]: + def _init_pooler_config(self) -> Optional["PoolerConfig"]: if self.runner_type == "pooling": - user_config = override_pooler_config or PoolerConfig() + if isinstance(self.override_pooler_config, dict): + self.override_pooler_config = PoolerConfig( + **self.override_pooler_config) + + pooler_config = self.override_pooler_config or PoolerConfig() base_config = get_pooling_config(self.model, self.revision) if base_config is not None: # Only set values that are not overridden by the user for k, v in base_config.items(): - if getattr(user_config, k) is None: - setattr(user_config, k, v) + if getattr(pooler_config, k) is None: + setattr(pooler_config, k, v) if self.is_matryoshka: - if user_config.normalize is None: - user_config.normalize = True - elif not user_config.normalize: + if pooler_config.normalize is None: + pooler_config.normalize = True + elif not pooler_config.normalize: raise ValueError( "`normalize` must be enabled (set to True) " "for models that are compatible with " "Matryoshka Representation.") - return user_config + return pooler_config return None @@ -662,11 +670,11 @@ class ModelConfig: return self.registry.model_has_inner_state(self.architectures) def _verify_tokenizer_mode(self) -> None: - tokenizer_mode = 
self.tokenizer_mode.lower() - if tokenizer_mode not in ["auto", "slow", "mistral", "custom"]: + tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) + if tokenizer_mode not in get_args(TokenizerMode): raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto', 'slow', 'mistral' or 'custom'.") + f"one of {get_args(TokenizerMode)}.") self.tokenizer_mode = tokenizer_mode def _get_preferred_task( @@ -781,7 +789,8 @@ class ModelConfig: "quark", "nvfp4", "bitblas", "gptq_bitblas" ] if self.quantization is not None: - self.quantization = self.quantization.lower() + self.quantization = cast(QuantizationMethods, + self.quantization.lower()) # Parse quantization method from the HF model config, if available. quant_cfg = self._parse_quant_hf_config() @@ -857,8 +866,6 @@ class ModelConfig: "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: - if self.max_seq_len_to_capture is None: - self.max_seq_len_to_capture = self.max_model_len self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) ROCM_UNSUPPORTED_MODELS = ['mllama'] @@ -1294,7 +1301,7 @@ class ModelConfig: @property def runner_type(self) -> RunnerType: - return _TASK_RUNNER[self.task] + return _TASK_RUNNER[cast(_ResolvedTask, self.task)] @property def is_v1_compatible(self) -> bool: @@ -2201,7 +2208,7 @@ class SpeculativeConfig: according to the log probability settings in SamplingParams.""" # Draft model configuration - quantization: Optional[str] = None + quantization: Optional[QuantizationMethods] = None """Quantization method that was used to quantize the draft model weights. If `None`, we assume the model weights are not quantized. Note that it only takes effect when using the draft model-based speculative method.""" @@ -2386,7 +2393,6 @@ class SpeculativeConfig: code_revision=self.code_revision, tokenizer_revision=self.target_model_config. 
tokenizer_revision, - max_model_len=None, spec_target_max_model_len=self.target_model_config. max_model_len, quantization=self.quantization, @@ -2793,30 +2799,31 @@ class PromptAdapterConfig: class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: dict[str, int] = field(default_factory=dict) + limit_per_prompt: dict[str, int] = get_field(ModelConfig, + "limit_mm_per_prompt") """ The maximum number of input items allowed per prompt for each modality. This should be a JSON string that will be parsed into a dictionary. Defaults to 1 (V0) or 999 (V1) for each modality. For example, to allow up to 16 images and 2 videos per prompt: - :code:`{"images": 16, "videos": 2}` + `{"images": 16, "videos": 2}` """ mm_processor_kwargs: Optional[dict[str, object]] = None """ Overrides for the multi-modal processor obtained from - :meth:`transformers.AutoProcessor.from_pretrained`. + `transformers.AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: - :code:`{"num_crops": 4}`. + `{"num_crops": 4}`. """ disable_mm_preprocessor_cache: bool = False """ - If :code:`True`, disable caching of the processed multi-modal inputs. + If `True`, disable caching of the processed multi-modal inputs. 
""" def compute_hash(self) -> str: @@ -2907,10 +2914,6 @@ class PoolerConfig: usedforsecurity=False).hexdigest() return hash_str - @staticmethod - def from_json(json_str: str) -> "PoolerConfig": - return PoolerConfig(**json.loads(json_str)) - _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.float16, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index be0cd4d3a..4f074fcd1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -20,15 +20,16 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, GuidedDecodingBackendV1, HfOverrides, KVTransferConfig, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, ModelImpl, MultiModalConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, PromptAdapterConfig, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, - TaskOption, TokenizerPoolConfig, VllmConfig, - get_attr_docs, get_field) + LoRAConfig, ModelConfig, ModelDType, ModelImpl, + MultiModalConfig, ObservabilityConfig, ParallelConfig, + PoolerConfig, PrefixCachingHashAlgo, + PromptAdapterConfig, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, TaskOption, TokenizerMode, + TokenizerPoolConfig, VllmConfig, get_attr_docs, + get_field) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.plugins import load_general_plugins from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 @@ -183,6 +184,9 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name]["nargs"] = "+" elif contains_type(type_hints, int): kwargs[name]["type"] = int + # Special case for large integers + if name in {"max_model_len"}: + kwargs[name]["type"] = human_readable_int elif 
contains_type(type_hints, float): kwargs[name]["type"] = float elif contains_type(type_hints, dict): @@ -212,22 +216,23 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: @dataclass class EngineArgs: """Arguments for vLLM engine.""" - model: str = 'facebook/opt-125m' - served_model_name: Optional[Union[str, List[str]]] = None - tokenizer: Optional[str] = None - hf_config_path: Optional[str] = None - task: TaskOption = "auto" - skip_tokenizer_init: bool = False - tokenizer_mode: str = 'auto' - trust_remote_code: bool = False - allowed_local_media_path: str = "" + model: str = ModelConfig.model + served_model_name: Optional[Union[ + str, List[str]]] = ModelConfig.served_model_name + tokenizer: Optional[str] = ModelConfig.tokenizer + hf_config_path: Optional[str] = ModelConfig.hf_config_path + task: TaskOption = ModelConfig.task + skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init + tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode + trust_remote_code: bool = ModelConfig.trust_remote_code + allowed_local_media_path: str = ModelConfig.allowed_local_media_path download_dir: Optional[str] = LoadConfig.download_dir load_format: str = LoadConfig.load_format - config_format: ConfigFormat = ConfigFormat.AUTO - dtype: str = 'auto' + config_format: str = ModelConfig.config_format + dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: Optional[int] = None - max_model_len: Optional[int] = None + seed: Optional[int] = ModelConfig.seed + max_model_len: Optional[int] = ModelConfig.max_model_len # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. 
@@ -245,8 +250,8 @@ class EngineArgs: enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching prefix_caching_hash_algo: PrefixCachingHashAlgo = \ CacheConfig.prefix_caching_hash_algo - disable_sliding_window: bool = False - disable_cascade_attn: bool = False + disable_sliding_window: bool = ModelConfig.disable_sliding_window + disable_cascade_attn: bool = ModelConfig.disable_cascade_attn use_v2_block_manager: bool = True swap_space: float = CacheConfig.swap_space cpu_offload_gb: float = CacheConfig.cpu_offload_gb @@ -258,18 +263,19 @@ class EngineArgs: long_prefill_token_threshold: int = \ SchedulerConfig.long_prefill_token_threshold max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs - max_logprobs: int = 20 # Default value for OpenAI Chat Completions API + max_logprobs: int = ModelConfig.max_logprobs disable_log_stats: bool = False - revision: Optional[str] = None - code_revision: Optional[str] = None - rope_scaling: Optional[Dict[str, Any]] = None - rope_theta: Optional[float] = None - hf_token: Optional[Union[bool, str]] = None - hf_overrides: Optional[HfOverrides] = None - tokenizer_revision: Optional[str] = None - quantization: Optional[str] = None - enforce_eager: Optional[bool] = None - max_seq_len_to_capture: int = 8192 + revision: Optional[str] = ModelConfig.revision + code_revision: Optional[str] = ModelConfig.code_revision + rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling") + rope_theta: Optional[float] = ModelConfig.rope_theta + hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token + hf_overrides: Optional[HfOverrides] = \ + get_field(ModelConfig, "hf_overrides") + tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision + quantization: Optional[QuantizationMethods] = ModelConfig.quantization + enforce_eager: bool = ModelConfig.enforce_eager + max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce # The 
following three fields are deprecated and will be removed in a future # release. Setting them will have no effect. Please remove them from your @@ -280,8 +286,10 @@ class EngineArgs: get_field(TokenizerPoolConfig, "extra_config") limit_mm_per_prompt: dict[str, int] = \ get_field(MultiModalConfig, "limit_per_prompt") - mm_processor_kwargs: Optional[Dict[str, Any]] = None - disable_mm_preprocessor_cache: bool = False + mm_processor_kwargs: Optional[Dict[str, Any]] = \ + MultiModalConfig.mm_processor_kwargs + disable_mm_preprocessor_cache: bool = \ + MultiModalConfig.disable_mm_preprocessor_cache # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -323,7 +331,8 @@ class EngineArgs: DecodingConfig.disable_any_whitespace guided_decoding_disable_additional_properties: bool = \ DecodingConfig.disable_additional_properties - logits_processor_pattern: Optional[str] = None + logits_processor_pattern: Optional[ + str] = ModelConfig.logits_processor_pattern speculative_config: Optional[Dict[str, Any]] = None @@ -331,22 +340,25 @@ class EngineArgs: show_hidden_metrics_for_version: Optional[str] = None otlp_traces_endpoint: Optional[str] = None collect_detailed_traces: Optional[str] = None - disable_async_output_proc: bool = False + disable_async_output_proc: bool = not ModelConfig.use_async_output_proc scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls - override_neuron_config: Optional[Dict[str, Any]] = None - override_pooler_config: Optional[PoolerConfig] = None + override_neuron_config: dict[str, Any] = \ + get_field(ModelConfig, "override_neuron_config") + override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ + ModelConfig.override_pooler_config compilation_config: Optional[CompilationConfig] = None worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls kv_transfer_config: 
Optional[KVTransferConfig] = None - generation_config: Optional[str] = "auto" - override_generation_config: Optional[Dict[str, Any]] = None - enable_sleep_mode: bool = False - model_impl: str = "auto" + generation_config: str = ModelConfig.generation_config + enable_sleep_mode: bool = ModelConfig.enable_sleep_mode + override_generation_config: dict[str, Any] = \ + get_field(ModelConfig, "override_generation_config") + model_impl: str = ModelConfig.model_impl calculate_kv_scales: bool = CacheConfig.calculate_kv_scales @@ -356,9 +368,6 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load def __post_init__(self): - if not self.tokenizer: - self.tokenizer = self.model - # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -375,80 +384,87 @@ class EngineArgs: """Shared CLI arguments for vLLM engine.""" # Model arguments - parser.add_argument( - '--model', - type=str, - default=EngineArgs.model, - help='Name or path of the huggingface model to use.') - parser.add_argument( - '--task', - default=EngineArgs.task, - choices=get_args(TaskOption), - help='The task to use the model for. Each vLLM instance only ' - 'supports one task, even if the same model can be used for ' - 'multiple tasks. When the model only supports one task, ``"auto"`` ' - 'can be used to select it; otherwise, you must specify explicitly ' - 'which task to use.') - parser.add_argument( - '--tokenizer', - type=optional_type(str), - default=EngineArgs.tokenizer, - help='Name or path of the huggingface tokenizer to use. ' - 'If unspecified, model name or path will be used.') - parser.add_argument( - "--hf-config-path", - type=optional_type(str), - default=EngineArgs.hf_config_path, - help='Name or path of the huggingface config to use. ' - 'If unspecified, model name or path will be used.') - parser.add_argument( - '--skip-tokenizer-init', - action='store_true', - help='Skip initialization of tokenizer and detokenizer. 
' - 'Expects valid prompt_token_ids and None for prompt from ' - 'the input. The generated output will contain token ids.') - parser.add_argument( - '--revision', - type=optional_type(str), - default=None, - help='The specific model version to use. It can be a branch ' - 'name, a tag name, or a commit id. If unspecified, will use ' - 'the default version.') - parser.add_argument( - '--code-revision', - type=optional_type(str), - default=None, - help='The specific revision to use for the model code on ' - 'Hugging Face Hub. It can be a branch name, a tag name, or a ' - 'commit id. If unspecified, will use the default version.') - parser.add_argument( - '--tokenizer-revision', - type=optional_type(str), - default=None, - help='Revision of the huggingface tokenizer to use. ' - 'It can be a branch name, a tag name, or a commit id. ' - 'If unspecified, will use the default version.') - parser.add_argument( - '--tokenizer-mode', - type=str, - default=EngineArgs.tokenizer_mode, - choices=['auto', 'slow', 'mistral', 'custom'], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - 'always use the slow tokenizer. \n* ' - '"mistral" will always use the `mistral_common` tokenizer. \n* ' - '"custom" will use --tokenizer to select the ' - 'preregistered tokenizer.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='Trust remote code from huggingface.') - parser.add_argument( - '--allowed-local-media-path', - type=str, - help="Allowing API requests to read local images or videos " - "from directories specified by the server file system. " - "This is a security risk. 
" - "Should only be enabled in trusted environments.") + model_kwargs = get_kwargs(ModelConfig) + model_group = parser.add_argument_group( + title="ModelConfig", + description=ModelConfig.__doc__, + ) + model_group.add_argument("--model", **model_kwargs["model"]) + model_group.add_argument("--task", **model_kwargs["task"]) + model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) + model_group.add_argument("--tokenizer-mode", + **model_kwargs["tokenizer_mode"]) + model_group.add_argument("--trust-remote-code", + **model_kwargs["trust_remote_code"]) + model_group.add_argument("--dtype", **model_kwargs["dtype"]) + model_group.add_argument("--seed", **model_kwargs["seed"]) + model_group.add_argument("--hf-config-path", + **model_kwargs["hf_config_path"]) + model_group.add_argument("--allowed-local-media-path", + **model_kwargs["allowed_local_media_path"]) + model_group.add_argument("--revision", **model_kwargs["revision"]) + model_group.add_argument("--code-revision", + **model_kwargs["code_revision"]) + model_group.add_argument("--rope-scaling", + **model_kwargs["rope_scaling"]) + model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"]) + model_group.add_argument("--tokenizer-revision", + **model_kwargs["tokenizer_revision"]) + model_group.add_argument("--max-model-len", + **model_kwargs["max_model_len"]) + model_group.add_argument("--quantization", "-q", + **model_kwargs["quantization"]) + model_group.add_argument("--enforce-eager", + **model_kwargs["enforce_eager"]) + model_group.add_argument("--max-seq-len-to-capture", + **model_kwargs["max_seq_len_to_capture"]) + model_group.add_argument("--max-logprobs", + **model_kwargs["max_logprobs"]) + model_group.add_argument("--disable-sliding-window", + **model_kwargs["disable_sliding_window"]) + model_group.add_argument("--disable-cascade-attn", + **model_kwargs["disable_cascade_attn"]) + model_group.add_argument("--skip-tokenizer-init", + **model_kwargs["skip_tokenizer_init"]) + 
model_group.add_argument("--served-model-name", + **model_kwargs["served_model_name"]) + # This one is a special case because it is the + # opposite of ModelConfig.use_async_output_proc + model_group.add_argument( + "--disable-async-output-proc", + action="store_true", + default=EngineArgs.disable_async_output_proc, + help="Disable async output processing. This may result in " + "lower performance.") + model_group.add_argument("--config-format", + choices=[f.value for f in ConfigFormat], + **model_kwargs["config_format"]) + # This one is a special case because it can bool + # or str. TODO: Handle this in get_kwargs + model_group.add_argument("--hf-token", + type=str, + nargs="?", + const=True, + default=model_kwargs["hf_token"]["default"], + help=model_kwargs["hf_token"]["help"]) + model_group.add_argument("--hf-overrides", + **model_kwargs["hf_overrides"]) + model_group.add_argument("--override-neuron-config", + **model_kwargs["override_neuron_config"]) + model_group.add_argument("--override-pooler-config", + **model_kwargs["override_pooler_config"]) + model_group.add_argument("--logits-processor-pattern", + **model_kwargs["logits_processor_pattern"]) + model_group.add_argument("--generation-config", + **model_kwargs["generation_config"]) + model_group.add_argument("--override-generation-config", + **model_kwargs["override_generation_config"]) + model_group.add_argument("--enable-sleep-mode", + **model_kwargs["enable_sleep_mode"]) + model_group.add_argument("--model-impl", + choices=[f.value for f in ModelImpl], + **model_kwargs["model_impl"]) + # Model loading arguments load_kwargs = get_kwargs(LoadConfig) load_group = parser.add_argument_group( @@ -465,38 +481,6 @@ class EngineArgs: load_group.add_argument('--use-tqdm-on-load', **load_kwargs["use_tqdm_on_load"]) - parser.add_argument( - '--config-format', - default=EngineArgs.config_format, - choices=[f.value for f in ConfigFormat], - help='The format of the model config to load.\n\n' - '* "auto" will try to 
load the config in hf format ' - 'if available else it will try to load in mistral format ') - parser.add_argument( - '--dtype', - type=str, - default=EngineArgs.dtype, - choices=[ - 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' - ], - help='Data type for model weights and activations.\n\n' - '* "auto" will use FP16 precision for FP32 and FP16 models, and ' - 'BF16 precision for BF16 models.\n' - '* "half" for FP16. Recommended for AWQ quantization.\n' - '* "float16" is the same as "half".\n' - '* "bfloat16" for a balance between precision and range.\n' - '* "float" is shorthand for FP32 precision.\n' - '* "float32" for FP32 precision.') - parser.add_argument('--max-model-len', - type=human_readable_int, - default=EngineArgs.max_model_len, - help='Model context length. If unspecified, will ' - 'be automatically derived from the model config. ' - 'Supports k/m/g/K/M/G in human-readable format.\n' - 'Examples:\n' - '- 1k → 1000\n' - '- 1K → 1024\n') - # Guided decoding arguments guided_decoding_kwargs = get_kwargs(DecodingConfig) guided_decoding_group = parser.add_argument_group( @@ -520,26 +504,6 @@ class EngineArgs: choices=list(ReasoningParserManager.reasoning_parsers), **guided_decoding_kwargs["reasoning_backend"]) - parser.add_argument( - '--logits-processor-pattern', - type=optional_type(str), - default=None, - help='Optional regex pattern specifying valid logits processor ' - 'qualified names that can be passed with the `logits_processors` ' - 'extra completion argument. 
Defaults to None, which allows no ' - 'processors.') - parser.add_argument( - '--model-impl', - type=str, - default=EngineArgs.model_impl, - choices=[f.value for f in ModelImpl], - help='Which implementation of the model to use.\n\n' - '* "auto" will try to use the vLLM implementation if it exists ' - 'and fall back to the Transformers implementation if no vLLM ' - 'implementation is available.\n' - '* "vllm" will use the vLLM model implementation.\n' - '* "transformers" will use the Transformers model ' - 'implementation.\n') # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( @@ -592,10 +556,6 @@ class EngineArgs: cache_group.add_argument('--calculate-kv-scales', **cache_kwargs["calculate_kv_scales"]) - parser.add_argument('--disable-sliding-window', - action='store_true', - help='Disables sliding window, ' - 'capping to sliding window size.') parser.add_argument('--use-v2-block-manager', action='store_true', default=True, @@ -605,73 +565,9 @@ class EngineArgs: 'Setting this flag to True or False' ' has no effect on vLLM behavior.') - parser.add_argument('--seed', - type=int, - default=EngineArgs.seed, - help='Random seed for operations.') - parser.add_argument( - '--max-logprobs', - type=int, - default=EngineArgs.max_logprobs, - help=('Max number of log probs to return logprobs is specified in' - ' SamplingParams.')) parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') - # Quantization settings. - parser.add_argument('--quantization', - '-q', - type=optional_type(str), - choices=[*QUANTIZATION_METHODS, None], - default=EngineArgs.quantization, - help='Method used to quantize the weights. If ' - 'None, we first check the `quantization_config` ' - 'attribute in the model config file. 
If that is ' - 'None, we assume the model weights are not ' - 'quantized and use `dtype` to determine the data ' - 'type of the weights.') - parser.add_argument( - '--rope-scaling', - default=None, - type=json.loads, - help='RoPE scaling configuration in JSON format. ' - 'For example, ``{"rope_type":"dynamic","factor":2.0}``') - parser.add_argument('--rope-theta', - default=None, - type=float, - help='RoPE theta. Use with `rope_scaling`. In ' - 'some cases, changing the RoPE theta improves the ' - 'performance of the scaled model.') - parser.add_argument( - '--hf-token', - type=str, - nargs='?', - const=True, - default=None, - help='The token to use as HTTP bearer authorization' - ' for remote files. If `True`, will use the token ' - 'generated when running `huggingface-cli login` ' - '(stored in `~/.huggingface`).') - parser.add_argument('--hf-overrides', - type=json.loads, - default=EngineArgs.hf_overrides, - help='Extra arguments for the HuggingFace config. ' - 'This should be a JSON string that will be ' - 'parsed into a dictionary.') - parser.add_argument('--enforce-eager', - action='store_true', - help='Always use eager-mode PyTorch. If False, ' - 'will use eager mode and CUDA graph in hybrid ' - 'for maximal performance and flexibility.') - parser.add_argument('--max-seq-len-to-capture', - type=int, - default=EngineArgs.max_seq_len_to_capture, - help='Maximum sequence length covered by CUDA ' - 'graphs. When a sequence has context length ' - 'larger than this, we fall back to eager mode. 
' - 'Additionally for encoder-decoder models, if the ' - 'sequence length of the encoder input is larger ' - 'than this, we fall back to the eager mode.') # Tokenizer arguments tokenizer_kwargs = get_kwargs(TokenizerPoolConfig) @@ -775,20 +671,6 @@ class EngineArgs: "Default to `original/**/*` to avoid repeated loading of llama's " "checkpoints.") - parser.add_argument( - "--served-model-name", - nargs="+", - type=str, - default=None, - help="The model name(s) used in the API. If multiple " - "names are provided, the server will respond to any " - "of the provided names. The model name in the model " - "field of a response will be the first name in this " - "list. If not specified, the model name will be the " - "same as the ``--model`` argument. Noted that this name(s) " - "will also be used in `model_name` tag content of " - "prometheus metrics, if multiple names provided, metrics " - "tag will take the first one.") parser.add_argument('--qlora-adapter-name-or-path', type=str, default=None, @@ -822,13 +704,6 @@ class EngineArgs: "modules. This involves use of possibly costly and or blocking " "operations and hence might have a performance impact.") - parser.add_argument( - '--disable-async-output-proc', - action='store_true', - default=EngineArgs.disable_async_output_proc, - help="Disable async output processing. This may result in " - "lower performance.") - # Scheduler arguments scheduler_kwargs = get_kwargs(SchedulerConfig) scheduler_group = parser.add_argument_group( @@ -871,19 +746,6 @@ class EngineArgs: parser.add_argument('--scheduler-cls', **scheduler_kwargs["scheduler_cls"]) - parser.add_argument( - '--override-neuron-config', - type=json.loads, - default=None, - help="Override or set neuron device configuration. " - "e.g. ``{\"cast_logits_dtype\": \"bloat16\"}``.") - parser.add_argument( - '--override-pooler-config', - type=PoolerConfig.from_json, - default=None, - help="Override or set the pooling method for pooling models. " - "e.g. 
``{\"pooling_type\": \"mean\", \"normalize\": false}``.") - parser.add_argument('--compilation-config', '-O', type=CompilationConfig.from_cli, @@ -920,34 +782,6 @@ class EngineArgs: help='The worker extension class on top of the worker cls, ' 'it is useful if you just want to add new functions to the worker ' 'class without changing the existing functions.') - parser.add_argument( - "--generation-config", - type=optional_type(str), - default="auto", - help="The folder path to the generation config. " - "Defaults to 'auto', the generation config will be loaded from " - "model path. If set to 'vllm', no generation config is loaded, " - "vLLM defaults will be used. If set to a folder path, the " - "generation config will be loaded from the specified folder path. " - "If `max_new_tokens` is specified in generation config, then " - "it sets a server-wide limit on the number of output tokens " - "for all requests.") - - parser.add_argument( - "--override-generation-config", - type=json.loads, - default=None, - help="Overrides or sets generation config in JSON format. " - "e.g. ``{\"temperature\": 0.5}``. If used with " - "--generation-config=auto, the override parameters will be merged " - "with the default config from the model. If generation-config is " - "None, only the override parameters are used.") - - parser.add_argument("--enable-sleep-mode", - action="store_true", - default=False, - help="Enable sleep mode for the engine. " - "(only cuda platform is supported)") parser.add_argument( "--additional-config", @@ -966,16 +800,6 @@ class EngineArgs: "If enabled, the model will be able to generate reasoning content." ) - parser.add_argument( - "--disable-cascade-attn", - action="store_true", - default=False, - help="Disable cascade attention for V1. While cascade attention " - "does not change the mathematical correctness, disabling it " - "could be useful for preventing potential numerical issues. 
" - "Note that even if this is set to False, cascade attention will be " - "only used when the heuristic tells that it's beneficial.") - return parser @classmethod @@ -1002,8 +826,7 @@ class EngineArgs: model=self.model, hf_config_path=self.hf_config_path, task=self.task, - # We know this is not None because we set it in __post_init__ - tokenizer=cast(str, self.tokenizer), + tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f1f48c700..79f1d80f4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,7 +13,7 @@ from typing_extensions import TypeVar, deprecated from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, ModelDType, TokenizerMode from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, TaskOption) from vllm.engine.llm_engine import LLMEngine @@ -32,6 +32,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest, LLMGuidedOptions) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput, PoolingRequestOutput, RequestOutput, ScoringRequestOutput) @@ -163,20 +164,20 @@ class LLM: self, model: str, tokenizer: Optional[str] = None, - tokenizer_mode: str = "auto", + tokenizer_mode: TokenizerMode = "auto", skip_tokenizer_init: bool = False, trust_remote_code: bool = False, allowed_local_media_path: str = "", tensor_parallel_size: int = 1, - dtype: str = "auto", - quantization: Optional[str] = None, + dtype: ModelDType = "auto", + quantization: Optional[QuantizationMethods] = None, revision: Optional[str] = None, 
tokenizer_revision: Optional[str] = None, seed: Optional[int] = None, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, - enforce_eager: Optional[bool] = None, + enforce_eager: bool = False, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, @@ -189,12 +190,7 @@ class LLM: compilation_config: Optional[Union[int, dict[str, Any]]] = None, **kwargs, ) -> None: - ''' - LLM constructor. - - Note: if enforce_eager is unset (enforce_eager is None) - it defaults to False. - ''' + """LLM constructor.""" if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 10f5241f9..0b74e8faf 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.utils import set_weight_attrs @@ -186,7 +187,7 @@ class AQLMConfig(QuantizationConfig): f"out_group_size={self.out_group_size})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "aqlm" @classmethod diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 227be1497..cfc31ae20 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -7,6 +7,7 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import 
QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (GroupQuantScaleParameter, @@ -44,7 +45,7 @@ class AWQConfig(QuantizationConfig): f"zero_point={self.zero_point}, " f"modules_to_not_convert={self.modules_to_not_convert})") - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "awq" def get_supported_act_dtypes(self) -> List[torch.dtype]: diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index ef4a7765d..193e90b85 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.awq import (AWQConfig, is_layer_skipped_awq) from vllm.model_executor.layers.quantization.base_config import ( @@ -73,7 +74,7 @@ class AWQMarlinConfig(QuantizationConfig): f"modules_to_not_convert={self.modules_to_not_convert})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "awq_marlin" @classmethod @@ -101,8 +102,8 @@ class AWQMarlinConfig(QuantizationConfig): modules_to_not_convert, config) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg) is_valid_user_quant = (user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin") diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 
5ef11546f..8cf058b40 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -2,11 +2,16 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type import torch from torch import nn +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationMethods +else: + QuantizationMethods = str + class QuantizeMethodBase(ABC): """Base class for different quantized methods.""" @@ -66,7 +71,7 @@ class QuantizationConfig(ABC): self.packed_modules_mapping: Dict[str, List[str]] = dict() @abstractmethod - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: """Name of the quantization method.""" raise NotImplementedError @@ -99,8 +104,8 @@ class QuantizationConfig(ABC): raise NotImplementedError @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: """ Detects if this quantization method can support a given checkpoint format by overriding the user specified quantization method -- diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 3eaaa6c25..ab858d720 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -5,6 +5,7 @@ import torch from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( @@ -100,7 +101,7 @@ class BitBLASConfig(QuantizationConfig): f"quant_method={self.quant_method})") 
@classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "bitblas" @classmethod @@ -139,8 +140,8 @@ class BitBLASConfig(QuantizationConfig): lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: # compat: autogptq >=0.8.0 use checkpoint_format: str # compat: autogptq <=0.7.1 is_bitblas_format: bool is_bitblas_format = (hf_quant_cfg.get("checkpoint_format") == "bitblas" diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index f5d32efe8..a472779d9 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -7,6 +7,7 @@ import torch from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.utils import direct_register_custom_op @@ -56,7 +57,7 @@ class BitsAndBytesConfig(QuantizationConfig): f"llm_int8_skip_modules={self.llm_int8_skip_modules})") @classmethod - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "bitsandbytes" @classmethod diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 5be6b22c7..0585c09bd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -16,6 +16,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear 
import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501 @@ -71,7 +72,7 @@ class CompressedTensorsConfig(QuantizationConfig): def get_min_capability(cls) -> int: return 70 - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "compressed-tensors" def get_quant_method( diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 67934d372..df7ec3376 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -7,6 +7,7 @@ import torch.nn as nn import torch.nn.functional as F from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.utils import set_weight_attrs @@ -41,8 +42,8 @@ class DeepSpeedFPConfig(QuantizationConfig): f"group_size={self.group_size}") @classmethod - def get_name(cls) -> str: - return "DeepSpeedFP" + def get_name(cls) -> QuantizationMethods: + return "deepspeedfp" @classmethod def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index be19b8097..cce95941b 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -8,6 +8,7 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe import FusedMoE, 
FusedMoEMethodBase from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -20,7 +21,7 @@ class ExpertsInt8Config(QuantizationConfig): super().__init__() @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "experts_int8" @classmethod diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 7dddc40f3..1fa2b3a8e 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -9,6 +9,7 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( @@ -38,7 +39,7 @@ class FBGEMMFp8Config(QuantizationConfig): self.fp8_linear = Fp8LinearOp() @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "fbgemm_fp8" @classmethod diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 01056c37b..5515ba27e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from 
vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod @@ -83,7 +84,7 @@ class Fp8Config(QuantizationConfig): self.weight_block_size = weight_block_size @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "fp8" @classmethod diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 6b499f81c..05058dfaa 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -31,7 +32,7 @@ class GGUFConfig(QuantizationConfig): def __repr__(self) -> str: return ("GGUFConfig()") - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "gguf" def get_supported_act_dtypes(self) -> List[torch.dtype]: diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 1c8d6cb1e..5059e0cdf 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -10,6 +10,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import 
( QuantizationConfig) from vllm.model_executor.layers.quantization.utils.gptq_utils import ( @@ -79,7 +80,7 @@ class GPTQConfig(QuantizationConfig): f"dynamic={self.dynamic}") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "gptq" @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index 88cada4c6..891d8cdf3 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -7,6 +7,7 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( @@ -123,7 +124,7 @@ class GPTQBitBLASConfig(QuantizationConfig): f"quant_method={self.quant_method})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "gptq_bitblas" @classmethod @@ -151,8 +152,8 @@ class GPTQBitBLASConfig(QuantizationConfig): lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: can_convert = cls.is_gptq_bitblas_compatible(hf_quant_cfg) is_valid_user_quant = (user_quant is None or user_quant == "bitblas" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 52cd0a5b6..c7f9d95f4 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( 
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( @@ -100,7 +101,7 @@ class GPTQMarlinConfig(QuantizationConfig): f"dynamic={self.dynamic}") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "gptq_marlin" @classmethod @@ -130,8 +131,8 @@ class GPTQMarlinConfig(QuantizationConfig): lm_head_quantized, dynamic, config) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) is_valid_user_quant = (user_quant is None or user_quant == "marlin" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index dd747e182..1fe08e4b3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -8,6 +8,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (BasevLLMParameter, @@ -85,7 +86,7 @@ class GPTQMarlin24Config(QuantizationConfig): self.quant_type, self.group_size) @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "gptq_marlin_24" @classmethod @@ -108,8 +109,8 @@ 
class GPTQMarlin24Config(QuantizationConfig): return cls(weight_bits, group_size) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: is_marlin_24_format = ( hf_quant_cfg.get("checkpoint_format") == "marlin_24") diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index 4edc9aa84..7bd398137 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -8,6 +8,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( @@ -50,7 +51,7 @@ class HQQMarlinConfig(QuantizationConfig): f"group_size={self.group_size})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "hqq" @classmethod diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index c09cc13cb..212af278f 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -6,6 +6,7 @@ import torch from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod, is_layer_skipped_awq) from vllm.model_executor.layers.quantization.base_config import ( @@ -58,7 +59,7 @@ class IPEXConfig(QuantizationConfig): 
f"group_size={self.group_size})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "ipex" @classmethod @@ -97,8 +98,8 @@ class IPEXConfig(QuantizationConfig): lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: if not current_platform.is_cpu() and not current_platform.is_xpu(): return None diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 4cf0c677c..9ef71a789 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -8,6 +8,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -63,7 +64,7 @@ class MarlinConfig(QuantizationConfig): f"lm_head_quantized={self.lm_head_quantized})") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "marlin" @classmethod @@ -87,8 +88,8 @@ class MarlinConfig(QuantizationConfig): return cls(group_size, lm_head_quantized) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: # compat: autogptq >=0.8.0 use checkpoint_format: str # compat: autogptq <=0.7.1 is_marlin_format: bool is_marlin_format = (hf_quant_cfg.get("checkpoint_format") == "marlin" diff --git a/vllm/model_executor/layers/quantization/modelopt.py 
b/vllm/model_executor/layers/quantization/modelopt.py index 3de153699..828447dd1 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -11,6 +11,7 @@ from vllm._custom_ops import (cutlass_scaled_fp4_mm, from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod @@ -42,7 +43,7 @@ class ModelOptFp8Config(QuantizationConfig): " the format is experimental and could change.") @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "modelopt" @classmethod @@ -184,8 +185,8 @@ class ModelOptNvFp4Config(QuantizationConfig): self.exclude_modules = exclude_modules @classmethod - def get_name(cls) -> str: - return "modelopt_nvfp4" + def get_name(cls) -> QuantizationMethods: + return "nvfp4" @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 00c4b661e..b8e3a4364 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( @@ -64,7 +65,7 @@ class 
MoeWNA16Config(QuantizationConfig): self.modules_to_not_convert = modules_to_not_convert @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "moe_wna16" @classmethod @@ -100,8 +101,8 @@ class MoeWNA16Config(QuantizationConfig): lm_head_quantized, modules_to_not_convert, config) @classmethod - def override_quantization_method(cls, hf_quant_cfg, - user_quant) -> Optional[str]: + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg) if can_convert and user_quant == "moe_wna16": return cls.get_name() diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index f6f66803f..7933eab2a 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional from torch.nn import Module +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -30,7 +31,7 @@ class NeuronQuantConfig(QuantizationConfig): self.dequant_dtype = dequant_dtype self.quantize_method = quantize_method - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "neuron_quant" def get_supported_act_dtypes(self) -> List[str]: diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 592ffc5da..004d74e68 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -9,6 +9,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from 
vllm.model_executor.layers.quantization.base_config import ( QuantizeMethodBase) from vllm.model_executor.layers.quantization.fp8 import (Fp8Config, @@ -50,7 +51,7 @@ class PTPCFp8Config(Fp8Config): ignored_layers=ignored_layers) @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "ptpc_fp8" @classmethod diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 1e05917a5..06ff6c71b 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -8,6 +8,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (BasevLLMParameter, @@ -84,7 +85,7 @@ class QQQConfig(QuantizationConfig): self.weight_bits, self.group_size) @classmethod - def get_name(cls) -> str: + def get_name(cls) -> QuantizationMethods: return "qqq" @classmethod diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index cf9108ea7..da2312190 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -8,6 +8,7 @@ import torch from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod @@ -47,7 +48,7 @@ class 
QuarkConfig(QuantizationConfig): def get_min_capability(cls) -> int: return 70 - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "quark" def get_quant_method(self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 5c2babcf4..751002fa0 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -6,6 +6,7 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.utils import set_weight_attrs @@ -20,7 +21,7 @@ class TorchAOConfig(QuantizationConfig): def __repr__(self) -> str: return f"TorchAOConfig({self.torchao_config})" - def get_name(self) -> str: + def get_name(self) -> QuantizationMethods: return "torchao" def get_supported_act_dtypes(self) -> List[torch.dtype]: diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 14e5bcf6e..8333c16ce 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -7,6 +7,7 @@ from torch.nn import Module from torch.nn.parameter import Parameter from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import ModelWeightParameter @@ -27,7 +28,7 @@ class Int8TpuConfig(QuantizationConfig): f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme - def get_name(self) -> str: + def get_name(self) 
-> QuantizationMethods: return "tpu_int8" def get_supported_act_dtypes(self) -> List[torch.dtype]: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c5970c71c..00f4e66bd 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1496,7 +1496,7 @@ def get_rope( if key in _ROPE_DICT: return _ROPE_DICT[key] - if rope_scaling is None: + if not rope_scaling: rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, is_neox_style, dtype) else: diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 67aaad10f..a7b313f4e 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -180,7 +180,6 @@ def _get_neuron_config_after_override(default_neuron_config, NeuronConfig, QuantizationConfig, SparseAttnConfig) - overridden_neuron_config = overridden_neuron_config or {} sparse_attn = overridden_neuron_config.pop("sparse_attn", {}) if sparse_attn: overridden_neuron_config["sparse_attn"] = SparseAttnConfig( -- GitLab From d1f569b1b9ce37aa77873b7f2bdb73612c4d9f23 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Apr 2025 03:39:18 +0100 Subject: [PATCH 047/461] Fix call to `logger.info_once` (#17416) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/guided_decoding/xgrammar_decoding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 40f722410..c63acfdde 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -191,9 +191,9 @@ class GrammarConfig: if model_with_warn is not None and any_whitespace: logger.info_once( - "%s model 
detected, consider setting " + f"{model_with_warn} model detected, consider setting " "`disable_any_whitespace` to prevent runaway generation " - "of whitespaces.", model_with_warn) + "of whitespaces.") # Validate the schema and raise ValueError here if it is invalid. # This is to avoid exceptions in model execution, which will crash # the engine worker process. -- GitLab From 88fcf00ddaa99e9eb1da58c4d46dc5bf59bbf42d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 29 Apr 2025 19:41:02 -0700 Subject: [PATCH 048/461] Fix some speculative decode tests with tl.dot (#17371) Signed-off-by: Huy Do --- tests/spec_decode/e2e/test_multistep_correctness.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index bb45be791..e187b6bc1 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( @pytest.mark.parametrize( "common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, # 2 for small prompt, 256//8 for generated. "num_gpu_blocks_override": 2 + 256 // 8, "max_model_len": (2 + 256 // 8) * 8, @@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - # As of this writing, vLLM only compiles with these 3 block sizes by - # default. 
- { - "block_size": 8, - }, + # https://github.com/triton-lang/triton/issues/2266 tl.dot + # doesn't support embedding < 16 { "block_size": 16, }, -- GitLab From a44c4f1d2f7cb882e0045b0c7d7cbcf8e08ef9bd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Apr 2025 22:10:30 -0600 Subject: [PATCH 049/461] Support LoRA for Mistral3 (#17428) Signed-off-by: mgoin --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/mistral3.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 95e7d5d60..8489ebe71 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -990,7 +990,7 @@ See [this page](#generative-models) for more information on how to use generativ * Mistral3 * T + I+ * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. - * + * ✅︎ * ✅︎ * ✅︎ - * `MllamaForConditionalGeneration` diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 12c87dc0f..c9abe4142 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -18,6 +18,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -31,7 +32,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import 
(MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -382,8 +384,8 @@ def init_vision_tower_for_llava( _build_mistral3_processor, info=_build_mistral3_info, dummy_inputs=Mistral3DummyInputsBuilder) -class Mistral3ForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): +class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, + SupportsMultiModal, SupportsPP): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], @@ -594,3 +596,12 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsMultiModal, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower") -- GitLab From 6ed9f6047e19abbb5c3e57cd083aa318b30eec02 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 30 Apr 2025 13:54:10 +0800 Subject: [PATCH 050/461] [Intel GPU] [CI]Fix XPU ci, setuptools >=80.0 have build issue (#17298) Signed-off-by: Kunshang Ji --- requirements/xpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/xpu.txt b/requirements/xpu.txt index fa09004d0..723ffcfc9 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -5,7 +5,7 @@ ray>=2.9 cmake>=3.26 packaging setuptools-scm>=8 -setuptools>=75.8.0 +setuptools>=75.8.0,<80.0.0 wheel jinja2>=3.1.6 datasets # for benchmark scripts -- GitLab From ed6cfb90c8ad13e77dcbfa0e211075a3e2f1ee7e Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 30 Apr 2025 15:03:58 +0800 Subject: [PATCH 051/461] [Hardware][Intel GPU] Upgrade to torch 2.7 (#17444) Signed-off-by: Kunshang Ji 
Co-authored-by: Qiming Zhang --- docker/Dockerfile.xpu | 6 ------ .../installation/gpu/xpu.inc.md | 9 --------- requirements/xpu.txt | 6 +++--- vllm/_ipex_ops.py | 18 +++++++++--------- vllm/attention/backends/ipex_attn.py | 14 ++++++-------- 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index ad4abf16b..681102b9d 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 setup.py install -# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu -# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-extension-for-pytorch==2.6.10+xpu \ - --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - CMD ["/bin/bash"] FROM vllm-base AS vllm-openai diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index fbf5421ee..4ab41a21c 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -35,13 +35,6 @@ pip install -v -r requirements/xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -- Finally, due to a known issue of conflict dependency(oneapi related) in torch-xpu 2.6 and ipex-xpu 2.6, we install ipex here. This will be fixed in the ipex-xpu 2.7. - -```console -pip install intel-extension-for-pytorch==2.6.10+xpu \ - --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -``` - :::{note} - FP16 is the default data type in the current XPU backend. The BF16 data type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. 
@@ -81,5 +74,3 @@ python -m vllm.entrypoints.openai.api_server \ ``` By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. - -There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 723ffcfc9..d9f2c007e 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,7 +10,7 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts -torch==2.6.0+xpu +torch==2.7.0+xpu torchaudio torchvision pytorch-triton-xpu @@ -18,6 +18,6 @@ pytorch-triton-xpu # Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu # FIXME: This will be fix in ipex 2.7. just leave this here for awareness. 
-# intel-extension-for-pytorch==2.6.10+xpu -oneccl_bind_pt==2.6.0+xpu +intel-extension-for-pytorch==2.7.10+xpu +oneccl_bind_pt==2.7.0+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index c3d210c27..505ebec34 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -177,6 +177,7 @@ class ipex_ops: out: torch.Tensor, seqlen_q: torch.Tensor, seqlen_k: torch.Tensor, + alibi_slopes: torch.Tensor, max_seqlen_q: int, max_seqlen_k: int, pdropout: float, @@ -185,6 +186,8 @@ class ipex_ops: is_causal: bool, return_softmax: bool, gen_: torch.Generator, + window_size_left: float, + window_size_right: float, logits_soft_cap: float, ) -> None: if ipex.__version__.endswith("cpu"): @@ -200,15 +203,12 @@ class ipex_ops: is_causal, return_softmax, gen_) else: # XPU build - ipex.llm.functional.varlen_attention(query.contiguous(), - key.contiguous(), - value.contiguous(), out, - seqlen_q.int(), - seqlen_k.int(), max_seqlen_q, - max_seqlen_k, pdropout, - softmax_scale, zero_tensors, - is_causal, return_softmax, - gen_, logits_soft_cap) + ipex.llm.functional.varlen_attention( + query.contiguous(), key.contiguous(), value.contiguous(), out, + seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q, + max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal, + return_softmax, gen_, window_size_left, window_size_right, + logits_soft_cap) @staticmethod def reshape_and_cache( diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 27959caa6..f322c7b3d 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -143,10 +143,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.alibi_slopes is not None - or self.sliding_window is not None) + self.need_mask = (self.sliding_window 
is not None) if logits_soft_cap is None: - logits_soft_cap = 0 + logits_soft_cap = -1 self.logits_soft_cap = logits_soft_cap supported_head_sizes = PagedAttention.get_supported_head_sizes() @@ -234,11 +233,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): dim=1) if attn_metadata.attn_bias is None: - if self.alibi_slopes is not None: - att_masks = _make_alibi_bias( - self.alibi_slopes, query.dtype, - attn_metadata.seq_lens) # type: ignore - elif self.sliding_window is not None: + if self.sliding_window is not None: att_masks = _make_sliding_window_bias( attn_metadata.seq_lens, self.sliding_window, query.dtype) # type: ignore @@ -258,6 +253,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): output, attn_metadata.seqlen_q, attn_metadata.seqlen_q, + self.alibi_slopes, attn_metadata.max_seqlen, attn_metadata.max_seqlen, pdropout=0.0, @@ -266,6 +262,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): is_causal=True, return_softmax=False, gen_=None, + window_size_left=-1, + window_size_right=-1, logits_soft_cap=self.logits_soft_cap, ) else: -- GitLab From be633fba0f8fc41b19a774a89ad055e54865af53 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 30 Apr 2025 15:11:04 +0800 Subject: [PATCH 052/461] [Bugfix] Fix AttributeError: 'State' object has no attribute 'engine_client' (#17434) Signed-off-by: chaunceyjiang --- vllm/entrypoints/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index c81ff9585..1c0271811 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -111,7 +111,7 @@ async def init_app( engine = (llm_engine if llm_engine is not None else AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.API_SERVER)) - + app.state.engine_client = engine return app -- GitLab From 54072f315f8f39f906e59b2153e05e9cf7cd524b Mon Sep 17 00:00:00 2001 From: Marco 
<121761685+mlinmg@users.noreply.github.com> Date: Wed, 30 Apr 2025 09:33:29 +0200 Subject: [PATCH 053/461] [MODEL ADDITION] Ovis2 Model Addition (#15826) Signed-off-by: Marco <121761685+mlinmg@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py --- docs/source/models/supported_models.md | 7 + examples/offline_inference/vision_language.py | 31 ++ .../vision_language_multi_image.py | 31 ++ .../vision_language/test_models.py | 12 + .../vision_language/vlm_utils/core.py | 2 +- .../vision_language/vlm_utils/model_utils.py | 30 ++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 4 + vllm/entrypoints/chat_utils.py | 5 +- vllm/model_executor/models/aimv2.py | 322 ++++++++++++++ vllm/model_executor/models/ovis2.py | 331 +++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/ovis2.py | 170 ++++++++ .../transformers_utils/processors/__init__.py | 3 +- vllm/transformers_utils/processors/ovis2.py | 397 ++++++++++++++++++ 17 files changed, 1349 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/aimv2.py create mode 100644 vllm/model_executor/models/ovis2.py create mode 100644 vllm/transformers_utils/configs/ovis2.py create mode 100644 vllm/transformers_utils/processors/ovis2.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 8489ebe71..831f9a86d 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1014,6 +1014,13 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `Ovis2ForConditionalGeneration`^ + * Ovis2 + * T + I+ + * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc. 
+ * + * + * ✅︎ - * `PaliGemmaForConditionalGeneration` * PaliGemma, PaliGemma 2 * T + IE diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index d02ac17cf..d455ea2de 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -725,6 +725,36 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: ) +# Ovis2 +def run_ovis2(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "AIDC-AI/Ovis2-1B" + tokenizer = "Isotr0py/Ovis2-tokenizer" + + engine_args = EngineArgs( + model=model_name, + tokenizer=tokenizer, + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}, + limit_mm_per_prompt={"image": 1}, + ) + + placeholder = "\n" + prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n{placeholder}" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # PaliGemma def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1041,6 +1071,7 @@ model_example_map = { "llama4": run_llama4, "molmo": run_molmo, "NVLM_D": run_nvlm_d, + "ovis2": run_ovis2, "paligemma": run_paligemma, "paligemma2": run_paligemma2, "phi3_v": run_phi3v, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 7f6608559..f16033993 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: ) +# Ovis2 +def load_ovis2(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = 
"AIDC-AI/Ovis2-1B" + tokenizer = "Isotr0py/Ovis2-tokenizer" + + engine_args = EngineArgs( + model=model_name, + tokenizer=tokenizer, + max_model_len=8192, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + limit_mm_per_prompt={"image": len(image_urls)}, + hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}, + ) + + placeholder = '\n'.join( + [f'Image {i+1}: ' for i in range(len(image_urls))]) + '\n' + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n{placeholder}" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" @@ -685,6 +715,7 @@ model_example_map = { "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, + "ovis2": load_ovis2, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, "pixtral_hf": load_pixtral_hf, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 6073364c0..3dd82b93f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -467,6 +467,18 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, patch_hf_runner=model_utils.molmo_patch_hf_runner, ), + "ovis2": VLMTestInfo( + models=["AIDC-AI/Ovis2-1B"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "\n", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + dtype="half", + # use sdpa mode for hf runner since ovis2 didn't work with flash_attn + hf_model_kwargs={"llm_attn_implementation": 
"sdpa"}, + patch_hf_runner=model_utils.ovis2_patch_hf_runner, + ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index fd046f3cd..c3d20f568 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -67,7 +67,7 @@ def run_test( "disable_mm_preprocessor_cache": True, } if model_info.tokenizer: - vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer + vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer if model_info.tokenizer_mode: vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode if model_info.hf_overrides: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 1185d80b9..c856fb198 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -676,3 +676,33 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model + + +def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Ovis2.""" + hf_model.model.visual_tokenizer.to(hf_model.dtype) + hf_model.model.vte.to(hf_model.dtype) + hf_model.model.llm.to(hf_model.dtype) + + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.llm.get_output_embeddings() + + def processor(*args, text="", images=None, **kwargs): + text_tokenizer = hf_model.model.get_text_tokenizer() + images = [images] if isinstance(images, Image) else images + + text = text.split("<|im_start|>user\n")[1].split("<|im_end|>\n")[0] + + prompt, input_ids, pixel_values = 
hf_model.model.preprocess_inputs( + text_or_conversations=text, images=images) + attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id) + + inputs = { + "inputs": input_ids.unsqueeze(0), + "pixel_values": pixel_values.unsqueeze(0), + "attention_mask": attention_mask.unsqueeze(0), + } + return BatchFeature(data=inputs, tensor_type="pt") + + hf_model.processor = processor + return hf_model diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 4dc49d18c..2b1d38dfd 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -274,6 +274,7 @@ def _test_processing_correctness_mistral( "allenai/Molmo-7B-D-0924", "allenai/Molmo-7B-O-0924", "nvidia/NVLM-D-72B", + "AIDC-AI/Ovis2-1B", "google/paligemma-3b-mix-224", "google/paligemma2-3b-ft-docci-448", "microsoft/Phi-4-multimodal-instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index f17f70189..2dbe1a41f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -348,6 +348,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501 + "Ovis2ForConditionalGeneration": _HfExamplesInfo("AIDC-AI/Ovis2-1B", + tokenizer="Isotr0py/Ovis2-tokenizer", + trust_remote_code=True, + hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}), # noqa: E501 "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index fcaa24eec..23dded7f2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -496,9 +496,10 @@ class BaseMultiModalItemTracker(ABC, 
Generic[_T]): if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) + if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2", - "internvl_chat", "skywork_chat", "NVLM_D", - "h2ovl_chat", "idefics3", "smolvlm"): + "internvl_chat", "ovis2", "skywork_chat", + "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"): return "" if model_type in ("mllama", "llama4"): return "<|image|>" diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py new file mode 100644 index 000000000..730e770dc --- /dev/null +++ b/vllm/model_executor/models/aimv2.py @@ -0,0 +1,322 @@ +# SPDX-License-Identifier: Apache-2.0 + +# A modified implementation of the AIMv2 Transformer +# inserted here also the image tokenizer used by Ovis2 +from typing import Optional + +import torch +from torch import nn, softmax +from torch.nn import functional as F +from torch.nn.functional import gumbel_softmax, pad + +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.transformers_utils.configs.ovis2 import (AIMv2Config, + Aimv2VisualTokenizerConfig) + +IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, + -305] # kept for vocab prefixed tokens + + +def st_argmax(y_soft: torch.Tensor, dim: int): # straight-through softmax + index = y_soft.max(dim, keepdim=True)[1] + y_hard = torch.zeros_like( + y_soft, memory_format=torch.legacy_contiguous_format).scatter_( + dim, index, 1.0) + ret = y_hard - y_soft.detach() + y_soft + return ret + + +class Aimv2VisualTokenizer(torch.nn.Module): + + def __init__(self, + config: Aimv2VisualTokenizerConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + **kwargs): + super().__init__() + self.config = config + self.backbone = AIMv2Model( + config=config.backbone_config, # noqa + quant_config=quant_config, + 
prefix=f"{prefix}.visual_tokenizer") + # reserved tokens for IMAGE_INDICATORS + head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS) + self.head = torch.nn.Sequential( + ReplicatedLinear( + config.backbone_config.hidden_size * config.hidden_stride * + config.hidden_stride, + head_dim, + bias=False, + ), torch.nn.LayerNorm(head_dim)) + + @property + def dtype(self): + return self.backbone.dtype + + @property + def device(self): + return self.backbone.device + + def tokenize(self, logits): + if self.config.tokenize_function == 'softmax': + tokens = softmax(logits, dim=-1) + elif self.config.tokenize_function == 'gumbel_argmax': + tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True) + elif self.config.tokenize_function == 'st_argmax': + tokens = st_argmax(logits, dim=-1) + else: + raise ValueError( + 'Invalid `max_type`, expected softmax or gumbel_argmax ' + f'or st_argmax, but got {self.config.tokenize_function}') + return tokens + + def encode(self, pixel_values): + features = self.backbone(pixel_values) + if self.config.drop_cls_token: + features = features[:, 1:, :] + + # merge number of `hidden_stride * hidden_stride` hidden states together + # to reduce token sequence length + # e.g., for hidden_stride=2, this leads to a token length reduction: + # 1024 -> 256 for aimv2 + if self.config.hidden_stride > 1: + # this `d` maybe different from the above `d`` + n, L, d = features.shape + sqrt_l = int(L**0.5) + assert sqrt_l**2 == L, ( + "The token sequence length should be a perfect square.") + features = features.reshape(n, sqrt_l, sqrt_l, d) + pl = (self.config.hidden_stride - + (sqrt_l % + self.config.hidden_stride)) % self.config.hidden_stride + features = pad(features, (0, 0, 0, pl, 0, pl), "constant", 0) + sqrt_l += pl + features = features.reshape(n, sqrt_l // self.config.hidden_stride, + self.config.hidden_stride, + sqrt_l // self.config.hidden_stride, + self.config.hidden_stride, d) + # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d] + features = 
features.permute(0, 1, 3, 2, 4, 5) + # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d] + features = features.flatten(3) + # [n, sqrt_l/hs*sqrt_l/hs, hs*hs*d] + features = features.reshape( + n, -1, + self.config.hidden_stride * self.config.hidden_stride * d) + + return features + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + """[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]""" + features = self.encode(pixel_values) + logits, _ = self.head[0]( + features) # we spllit the sequncial here for not throwing an error + logits = self.head[1](logits) + tokens = self.tokenize(logits) + # tokens' shape is [BatchSize, #Token, VocabSize-5], so padding with + # [BatchSize, #Token, 5], after which, tokens' shape should become + # [BatchSize, #Token, VocabSize] + batch_size, token_len, _ = tokens.shape + padding_tensor = torch.zeros(size=(batch_size, token_len, + len(IMAGE_INDICATOR_IDS)), + dtype=tokens.dtype, + device=tokens.device, + layout=tokens.layout, + requires_grad=False) + tokens = torch.cat((tokens, padding_tensor), dim=2) + return tokens + + +class AIMv2SwiGLUFFN(nn.Module): + + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): + super().__init__() + hidden_features = config.intermediate_size + in_features = config.hidden_size + bias = config.use_bias + + # TODO(Isotr0py): investigate if we can add TP to visual tokenizer + self.fc1 = ReplicatedLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + self.fc2 = ReplicatedLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2") + self.fc3 = ReplicatedLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc3") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_parallel, _ = self.fc1(x) + gate, _ = self.fc3(x) + x_parallel = F.silu(x_parallel) * gate + out, _ = self.fc2(x_parallel) + return out + + +class 
AIMv2PatchEmbed(nn.Module): + + def __init__(self, config: AIMv2Config): + super().__init__() + self.proj = nn.Conv2d( + config.num_channels, + config.hidden_size, + kernel_size=(config.patch_size, config.patch_size), + stride=(config.patch_size, config.patch_size), + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x).flatten(2).transpose(1, 2) + x = self.norm.forward_native(x) + return x + + +class AIMv2ViTPreprocessor(nn.Module): + + def __init__(self, config: AIMv2Config): + super().__init__() + num_patches = (config.image_size // config.patch_size)**2 + + self.patchifier = AIMv2PatchEmbed(config) + self.pos_embed = nn.Parameter( + torch.zeros((1, num_patches, config.hidden_size))) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + tokens = self.patchifier(x) + _, N, _ = tokens.shape + pos_embed = self.pos_embed.to(tokens.device) + tokens = tokens + pos_embed[:, :N] + return tokens + + +class AIMv2Attention(nn.Module): + + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): + super().__init__() + dim = config.hidden_size + + # TODO(Isotr0py): investigate if we can add TP to visual tokenizer + self.num_heads = config.num_attention_heads + self.qkv = ReplicatedLinear(dim, dim * 3, bias=config.qkv_bias) + # self.qkv = QKVParallelLinear( + # hidden_size=dim, + # head_size=dim // config.num_attention_heads, + # total_num_heads=config.num_attention_heads, + # bias=config.qkv_bias, + # quant_config=quant_config, + # prefix=f"{prefix}.qkv") + self.proj = ReplicatedLinear(dim, dim, bias=config.use_bias) + # self.proj = RowParallelLinear(input_size=dim, + # output_size=dim, + # bias = config.use_bias, + # quant_config=quant_config, + # prefix=f"{prefix}.proj") + + def forward( # todo might implement multiple attn implementations + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + B, N, C = x.shape + qkv, 
_ = self.qkv(x) + + qkv = qkv.reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv.unbind(0) + + x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask) + x = x.transpose(1, 2).contiguous().reshape(B, N, C) + x, _ = self.proj(x) + return x + + +class AIMv2Block(nn.Module): + + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): + super().__init__() + self.attn = AIMv2Attention(config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = AIMv2SwiGLUFFN(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward(self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + x = x + self.attn(self.norm_1.forward_native(x), mask) + x = x + self.mlp(self.norm_2.forward_native(x)) + return x + + +class AIMv2Transformer(nn.Module): + + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): + super().__init__() + + self.blocks = nn.ModuleList([ + AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}") + for i in range(config.num_hidden_layers) + ]) + self.post_trunk_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + tokens: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # they take the -1 as the ref embeddings, like a clip skip + for block in self.blocks: + tokens = block(tokens, mask) + # NO NORM IN THE OG IMPLEMENTATION + # tokens = self.post_trunk_norm(tokens) + return tokens + + +class AIMv2Model(torch.nn.Module): + + def __init__(self, + config: AIMv2Config, + quant_config: QuantizationConfig, + prefix: str = ""): + super().__init__() + self.preprocessor = AIMv2ViTPreprocessor(config) + self.trunk = AIMv2Transformer(config, + quant_config=quant_config, + prefix=f"{prefix}.trunk") + + 
@property + def dtype(self): + return self.trunk.blocks[0].attn.qkv.weight.dtype + + @property + def device(self): + return self.trunk.blocks[0].attn.qkv.device + + def forward( + self, + pixel_values: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + x = self.preprocessor(pixel_values) + x = self.trunk(x, mask) + + return x diff --git a/vllm/model_executor/models/ovis2.py b/vllm/model_executor/models/ovis2.py new file mode 100644 index 000000000..638077bc8 --- /dev/null +++ b/vllm/model_executor/models/ovis2.py @@ -0,0 +1,331 @@ +# SPDX-License-Identifier: Apache-2.0 + +# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/ovis/modeling_ovis.py +# Copyright 2023 The vLLM team. +# Copyright 2023 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Ovis2 model.""" +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) + +import torch +import torch.nn as nn +from torch import Tensor +from transformers import BatchFeature + +from vllm.config import VllmConfig +from vllm.model_executor.models.aimv2 import Aimv2VisualTokenizer +from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, + init_vllm_registered_model, + maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.ovis2 import OvisConfig +from vllm.transformers_utils.processors.ovis2 import OvisProcessor + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .utils import merge_multimodal_embeddings + +# Cannot find the following number from hf config. +IMAGE_TOKEN = "" +IMAGE_ATOM_TOKEN_ID = 151666 +IMAGE_PAD_TOKEN_ID = 151672 +NUMBER_OF_TOKEN_TO_RESERVE_FOR_SEGMENT = 256 + + +class Ovis2ImagePatchInputs(TypedDict): + type: Literal["image_patches"] + flat_data: torch.Tensor + """ + Shape: + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` + """ + + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + This is used to restore the first two dimensions of `flat_data`. 
+ """ + + +class VisualEmbedding(torch.nn.Embedding): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, visual_tokens: Tensor) -> Tensor: + if visual_tokens.dtype in [ + torch.int8, torch.int16, torch.int32, torch.int64, torch.long + ]: + return super().forward(visual_tokens) + return torch.matmul(visual_tokens, self.weight) + + @property + def device(self): + return self.weight.device + + @property + def dtype(self): + return self.weight.dtype + + +class Ovis2ProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(OvisConfig) + + def get_hf_processor(self, **kwargs): + return self.ctx.get_hf_processor(OvisProcessor) + + def get_image_processor(self) -> OvisProcessor: + return self.get_hf_processor().image_processor # type: ignore + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return { # 32k is model token limit at the moment + "image": + self.get_hf_config().multimodal_max_length // + ((9 + 1) * NUMBER_OF_TOKEN_TO_RESERVE_FOR_SEGMENT) + } + + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + return ImageSize(width=image_processor.size['shortest_edge'] * 9 * 2, + height=image_processor.size['shortest_edge'] * 9 * 2) + + +class Ovis2DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2ProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + return IMAGE_TOKEN * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + return mm_data + + +class Ovis2MultiModalProcessor(BaseMultiModalProcessor[Ovis2ProcessingInfo]): + + def 
_call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + # # Avoid warning from HF logger for text-only input + prompt_ids = self.info.get_tokenizer().encode(prompt) + # prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) nope + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + return processed_outputs + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + + return prompt_tokens + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image"), + grids=MultiModalFieldConfig.batched("image")) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + + def get_replacement_ovis(item_idx): + grid = out_mm_kwargs["grids"][item_idx] + + hf_processor = self.info.get_hf_processor() + return hf_processor.construct_image_placeholders(grid) + + return [ + PromptReplacement( + modality="image", + target=IMAGE_TOKEN, + replacement=get_replacement_ovis, + ), + ] + + +@MULTIMODAL_REGISTRY.register_processor(Ovis2MultiModalProcessor, + info=Ovis2ProcessingInfo, + dummy_inputs=Ovis2DummyInputsBuilder) +class Ovis2ForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.config: OvisConfig = config + self.llm = init_vllm_registered_model( + vllm_config=vllm_config.with_hf_config(config.get_text_config()), + prefix=maybe_prefix(prefix, 
"llm"), + ) + + self.visual_tokenizer = Aimv2VisualTokenizer( + config=config.visual_tokenizer_config, + quant_config=quant_config, + prefix=f"{prefix}.visual_tokenizer", + image_processor_name_or_path=config.visual_tokenizer_config. + backbone_config.name_or_path, + ) + + self.vte = VisualEmbedding( + self.config.visual_tokenizer_config.vocab_size, + self.config.hidden_size) + + # TODO(Isotr0py): PP support + # self.make_empty_intermediate_tensors = ( + # self.language_model.make_empty_intermediate_tensors) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Ovis2ImagePatchInputs]: + pixel_values = kwargs.pop("pixel_values", None) + if pixel_values is None: + return None + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Ovis2ImagePatchInputs( + type="image_patches", + flat_data=flatten_bn(flatten_bn(pixel_values), concat=True), + patches_per_image=[ + x.shape[0] for x in flatten_bn(pixel_values) + ], + ) + + raise AssertionError("This line should be unreachable.") + + def _process_image_input( + self, image_input: Ovis2ImagePatchInputs) -> MultiModalEmbeddings: + image_patches_flat = image_input["flat_data"] + patches_per_image = image_input["patches_per_image"] + + target_dtype = self.visual_tokenizer.dtype + visual_tokens = self.visual_tokenizer( + image_patches_flat.to(target_dtype)) + visual_embeds = self.vte(visual_tokens) # 1:1 numeric eq. 
+ + return tuple( + x.flatten(0, 1) + for x in visual_embeds.split(patches_per_image, dim=0)) + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + image_features = self._process_image_input(image_input) + + return image_features + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.llm.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [IMAGE_ATOM_TOKEN_ID, IMAGE_PAD_TOKEN_ID]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + # up until here we have a inputs_embeds 100% numerical identity + # between the OG HF Transformers implementation and ours + hidden_states = self.llm( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.llm.logits_processor(self.llm.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_language_model(self) -> torch.nn.Module: + return self.llm diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index df5b23232..156a201de 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -195,6 +195,7 @@ _MULTIMODAL_MODELS = { "Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501 "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"), + "Ovis2ForConditionalGeneration": ("ovis2", "Ovis2ForConditionalGeneration"), "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5ddfadb02..f6c2b3553 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -38,9 +38,9 @@ from vllm.transformers_utils.configs import 
(ChatGLMConfig, Cohere2Config, MiniMaxVL01Config, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, - RWConfig, SkyworkR1VChatConfig, - SolarConfig, Telechat2Config, - UltravoxConfig) + OvisConfig, RWConfig, + SkyworkR1VChatConfig, SolarConfig, + Telechat2Config, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -79,6 +79,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "ovis": OvisConfig, "solar": SolarConfig, "skywork_chat": SkyworkR1VChatConfig, "telechat": Telechat2Config, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 8945c45ea..db3efafee 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.ovis2 import OvisConfig from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.telechat2 import Telechat2Config @@ -49,6 +50,7 @@ __all__ = [ "KimiVLConfig", "NemotronConfig", "NVLM_D_Config", + "OvisConfig", "SkyworkR1VChatConfig", "SolarConfig", "Telechat2Config", diff --git a/vllm/transformers_utils/configs/ovis2.py b/vllm/transformers_utils/configs/ovis2.py new file mode 100644 index 000000000..437a16e77 --- /dev/null +++ b/vllm/transformers_utils/configs/ovis2.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 + +# yapf: disable +# ruff: noqa: E501 +# copied from 
https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py +# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py +from typing import Any, Optional, Union + +from transformers import AutoConfig, PretrainedConfig + + +class AIMv2Config(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. 
+ """ + + model_type: str = "aimv2" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + + self.projection_dropout = projection_dropout + self.qkv_bias = qkv_bias + self.use_bias = use_bias + + +IGNORE_ID = -100 +IMAGE_TOKEN_ID = -200 +IMAGE_TOKEN = "" +IMAGE_ATOM_ID = -300 +IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] + +AutoConfig.register("aimv2", AIMv2Config) + + +# ---------------------------------------------------------------------- +# Visual Tokenizer Configuration +# ---------------------------------------------------------------------- +class BaseVisualTokenizerConfig(PretrainedConfig): + + def __init__(self, + vocab_size=16384, + tokenize_function="softmax", + tau=1.0, + depths=None, + drop_cls_token=False, + backbone_config: Optional[Union[PretrainedConfig, + dict]] = None, + hidden_stride: int = 1, + **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.tokenize_function = tokenize_function + self.tau = tau + if isinstance(depths, str): + depths = [int(x) for x in depths.split('|')] + self.depths = depths + self.backbone_kwargs = dict[str, Any]() + self.drop_cls_token = drop_cls_token + if backbone_config is not None: + assert isinstance(backbone_config, (PretrainedConfig, dict)), \ + f"expect `backbone_config` 
to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" + if not isinstance(backbone_config, PretrainedConfig): + model_type = backbone_config['model_type'] + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, + **backbone_config) + self.backbone_config = backbone_config + self.hidden_stride = hidden_stride + + +class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "aimv2_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) + + +# ---------------------------------------------------------------------- +# Ovis Configuration +# ---------------------------------------------------------------------- +class OvisConfig(PretrainedConfig): + model_type = "ovis" + + def __init__(self, + llm_config: Optional[Union[PretrainedConfig, dict]] = None, + visual_tokenizer_config: Optional[Union[PretrainedConfig, + dict]] = None, + multimodal_max_length=8192, + hidden_size=None, + conversation_formatter_class=None, + llm_attn_implementation=None, + disable_tie_weight=False, + **kwargs): + super().__init__(**kwargs) + if llm_config is not None: + assert isinstance(llm_config, (PretrainedConfig, dict)), \ + f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" + if not isinstance(llm_config, PretrainedConfig): + model_type = llm_config['model_type'] + llm_config.pop('model_type') + llm_config = AutoConfig.for_model(model_type, **llm_config) + + # map llm_config to text_config + self.text_config = llm_config + if visual_tokenizer_config is not None: + assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ + f"expect `visual_tokenizer_config` to be instance of PretrainedConfig 
or dict, but got {type(visual_tokenizer_config)} type" + if not isinstance(visual_tokenizer_config, PretrainedConfig): + model_type = visual_tokenizer_config['model_type'] + visual_tokenizer_config.pop('model_type') + visual_tokenizer_config = AutoConfig.for_model( + model_type, **visual_tokenizer_config) + + self.visual_tokenizer_config = visual_tokenizer_config + self.multimodal_max_length = multimodal_max_length + self.hidden_size = hidden_size + self.conversation_formatter_class = conversation_formatter_class + self.llm_attn_implementation = llm_attn_implementation + self.disable_tie_weight = disable_tie_weight diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 4696f0c49..2e9cf3e4d 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -2,5 +2,6 @@ from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) +from vllm.transformers_utils.processors.ovis2 import OvisProcessor -__all__ = ["DeepseekVLV2Processor"] +__all__ = ["DeepseekVLV2Processor", "OvisProcessor"] diff --git a/vllm/transformers_utils/processors/ovis2.py b/vllm/transformers_utils/processors/ovis2.py new file mode 100644 index 000000000..fa5bdd40e --- /dev/null +++ b/vllm/transformers_utils/processors/ovis2.py @@ -0,0 +1,397 @@ +# SPDX-License-Identifier: Apache-2.0 + +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py +# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import PIL +import torch +from transformers import AutoProcessor, BatchFeature +from transformers.image_utils import ImageInput +from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin, + Unpack) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +__all__ = [ 'OvisProcessor'] +IGNORE_ID = -100 + +class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg] + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + 'max_partition':9, + 'covering_threshold':0.9, + 'convert_to_rgb':True, + 'return_tensors':'pt'}, + } + + + +class OvisProcessor(ProcessorMixin): + r""" + Constructs a Ovis processor which wraps a Ovis image processor and a Qwen2 tokenizer into a single processor. + [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the + [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information. + Args: + image_processor ([`Qwen2VLImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2TokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + + image_processor_class = "AutoImageProcessor" + tokenizer_class = "Qwen2Tokenizer" + + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token + self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.extra_special_tokens = { + "image_token": "", + "image_atom": "", + "image_start": "", + "image_prefix": "
",
+            "image_col_sep": "",
+            "image_row_sep": "",
+            "image_end": "",
+            'image_pad': '',
+        }
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        **kwargs: Unpack[OvisProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess `images`, tokenize `text`, and expand every image token found in the
+        tokenized text into the placeholder ids of the matching image.
+
+        Images are matched to image tokens in order of appearance; the running image
+        index is shared across all sequences of the batch.
+
+            Args:
+                images:
+                    A single image or a list of images. Each one is passed to
+                    `self.preprocess_image` together with the merged `images_kwargs`.
+                text:
+                    A single sequence or a list of sequences. It is encoded with
+                    `self.tokenizer.batch_encode_plus` using the merged `text_kwargs`.
+                kwargs:
+                    Processing options, merged with the `OvisProcessorKwargs` defaults
+                    and the tokenizer init kwargs by `self._merge_kwargs`.
+            Returns:
+                [`BatchFeature`]: When `text` is not `None`, a [`BatchFeature`] with the following fields:
+                - **input_ids** -- Token ids with image tokens replaced by placeholder ids, stacked into one tensor.
+                - **attention_mask** -- Matching attention mask; every inserted placeholder position is set to 1.
+                - **pixel_values** -- List with the pixel values of each image. Only set when images were processed.
+                - **grids** -- List with the grid of each image. Only set when images were processed.
+
+                When `text` is `None`, the [`BatchFeature`] only holds **image_placeholders**
+                (and is empty if no image was processed).
+            Raises:
+                RuntimeError: If the text contains more image tokens than images were provided.
+        """
+        output_kwargs = self._merge_kwargs(
+            OvisProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process all images first
+        image_features = {}
+        if images is not None:
+            processed_images = []
+            image_placeholders_list = []
+            grids = []
+
+            # Process each image
+            image_batch = images if isinstance(images, list) else [images]
+            for image in image_batch:
+                pixel_values, image_placeholders, grid = self.preprocess_image(
+                    image=image, **output_kwargs["images_kwargs"]
+                )
+                processed_images.append(pixel_values)
+                image_placeholders_list.append(image_placeholders)
+                grids.append(grid)
+
+            # assign all processed images
+            if processed_images:
+                image_features["image_placeholders"] = image_placeholders_list
+
+        # If only images were provided
+        if text is None:
+            return BatchFeature(data=image_features)
+
+        # Process text input
+        if not isinstance(text, list):
+            text = [text]
+
+        tokenized_batched_text = self.tokenizer.batch_encode_plus(
+            text,
+            **output_kwargs["text_kwargs"]
+        )
+        image_token_id = self.get_token_value("image_token")
+        # `None` when no image was processed; image tokens are then left untouched.
+        placeholders_per_image = image_features.get("image_placeholders")
+
+        replaced_ids_list = []
+        replaced_attn_mask_list = []
+        idx = 0  # index of the next image to consume, shared by all sequences
+        for ids_tensor, attn_mask in zip(tokenized_batched_text['input_ids'],
+                                         tokenized_batched_text['attention_mask']):
+            if placeholders_per_image is not None and image_token_id in ids_tensor:
+                # Converts in list for ease of use
+                # NOTE(review): `.tolist()` assumes the tokenizer returned tensors
+                # (e.g. `return_tensors="pt"`) -- confirm against the callers.
+                ids_list = ids_tensor.tolist()
+                attn_list = attn_mask.tolist()
+
+                new_ids = []
+                new_attn = []
+
+                # replace placeholders
+                for i, token_id in enumerate(ids_list):
+                    if token_id != image_token_id:
+                        new_ids.append(token_id)
+                        new_attn.append(attn_list[i])
+                        continue
+                    # Check at the point of use so that running out of images in the
+                    # middle of a sequence reports the mismatch instead of raising a
+                    # bare IndexError.
+                    if idx >= len(placeholders_per_image):
+                        raise RuntimeError(
+                            'Mismatch between the images you provided and the number of placeholder present in the text')
+                    placeholder_ids = placeholders_per_image[idx]
+                    new_ids.extend(placeholder_ids)
+                    new_attn.extend([1] * len(placeholder_ids))
+                    idx += 1
+
+                # Converts back to tensors
+                ids_tensor = torch.tensor(new_ids, dtype=torch.long)
+                attn_mask = torch.tensor(new_attn, dtype=torch.long)
+
+            replaced_ids_list.append(ids_tensor)
+            replaced_attn_mask_list.append(attn_mask)
+
+        if replaced_ids_list:
+            # `torch.stack` requires all sequences to have the same length after expansion.
+            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
+            replaced_and_tokenized_attn_mask = torch.stack(replaced_attn_mask_list)
+        else:
+            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
+            replaced_and_tokenized_attn_mask = torch.tensor([], dtype=torch.long)
+
+        # Create the output with text features
+        output = BatchFeature(
+            data={
+                "input_ids": replaced_and_tokenized_ids,
+                "attention_mask": replaced_and_tokenized_attn_mask,
+            }
+        )
+
+        # Add image features if present
+        if image_features:
+            output["pixel_values"] = processed_images
+            output['grids'] = grids
+
+        return output
+
+
+    def get_image_size(self):
+        """Return the image processor's crop size as a `(height, width)` tuple."""
+        return (
+            self.image_processor.crop_size["height"],
+            self.image_processor.crop_size["width"],
+        )
+
+    def get_token_value(self, tok):
+        """Return the tokenizer vocabulary id of the special token registered under `tok`."""
+        vocab = self.tokenizer.get_vocab()
+        token_text = self.extra_special_tokens[tok]
+        return vocab[token_text]
+
+    def construct_image_placeholders(self, grid):
+        """
+        Build the placeholder token ids that stand for one image.
+
+        The sequence is `image_start`, `image_atom`, `image_prefix`, then -- only when the
+        grid has more than one cell -- one `image_atom` per cell with `image_col_sep`
+        between the cells of a row and `image_row_sep` between rows, and finally
+        `image_end`. Every `image_atom` id is followed by 255 `image_pad` ids.
+
+        Args:
+            grid: Indexable whose first two items are the number of rows and columns.
+
+        Returns:
+            The list of token ids.
+        """
+        rows, cols = grid[0], grid[1]
+
+        # `get_token_value` goes through `tokenizer.get_vocab()` on every call, so
+        # resolve each id once instead of once per grid cell.
+        image_atom_token_id = self.get_token_value('image_atom')
+        image_placeholders = [self.get_token_value('image_start'),
+                              image_atom_token_id,
+                              self.get_token_value('image_prefix')]
+        if rows * cols > 1:
+            # A separator is only looked up when it is going to be emitted.
+            col_sep_token_id = self.get_token_value('image_col_sep') if cols > 1 else None
+            row_sep_token_id = self.get_token_value('image_row_sep') if rows > 1 else None
+            for r in range(rows):
+                for c in range(cols):
+                    image_placeholders.append(image_atom_token_id)
+                    if c < cols - 1:
+                        image_placeholders.append(col_sep_token_id)
+                if r < rows - 1:
+                    image_placeholders.append(row_sep_token_id)
+        image_placeholders.append(self.get_token_value('image_end'))
+
+        # Extract the padding token ID from tokenizer
+        image_padding_token_id = self.get_token_value('image_pad')
+        # Add 255 padding tokens after each image atom token
+        atom_padding = [image_padding_token_id] * 255
+
+        # Create a new list with padding tokens inserted
+        padded_placeholder_tokens = []
+        for token in image_placeholders:
+            padded_placeholder_tokens.append(token)
+            if token == image_atom_token_id:
+                padded_placeholder_tokens.extend(atom_padding)
+        return padded_placeholder_tokens
+
+    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+        def _preprocess(img: PIL.Image.Image, side):  # resize keeping aspect ratio, then zero-pad to side x side
+            # first resize so the longer edge equals `side`, then run the image processor
+            w, h = img.size
+            if w == h:
+                new_width = new_height = side
+            elif w > h:
+                new_width = side
+                new_height = int(h / w * new_width)  # NOTE(review): truncates; can reach 0 for extreme aspect ratios
+            else:
+                new_height = side
+                new_width = int(w / h * new_height)  # NOTE(review): truncates; can reach 0 for extreme aspect ratios
+            new_size = dict(height=new_height, width=new_width)
+            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+
+            # then pad to square: the resized image is centred along its shorter axis inside a zero tensor
+            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)  # NOTE(review): assumes a 3-channel torch tensor (return_tensors="pt") — confirm
+            new_height, new_width = pixel_values.shape[2:]  # actual size produced by the image processor
+            if new_height == new_width:
+                square_values[:, :, :, :] = pixel_values
+            elif new_height > new_width:
+                from_index = (side - new_width) // 2  # left offset that centres the image horizontally
+                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+            else:
+                from_index = (side - new_height) // 2  # top offset that centres the image vertically
+                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+
+            return square_values
+
+        def _partition(img, grid) -> list[tuple[int, int, int, int]]:  # grid = (rows, cols); boxes are (left, upper, right, lower)
+            w, h = img.size
+            row_height = h // grid[0]
+            col_width = w // grid[1]
+
+            partition = []
+            for row in range(grid[0]):
+                for col in range(grid[1]):
+                    left = col * col_width
+                    upper = row * row_height
+                    right = w if col == grid[1] - 1 else (col + 1) * col_width  # last column absorbs the division remainder
+                    lower = h if row == grid[0] - 1 else (row + 1) * row_height  # last row absorbs the division remainder
+                    partition.append((left, upper, right, lower))
+
+            return partition
+
+        def _covering_area(left, upper, right, lower, side):  # tile area after shrinking its longer edge to at most `side`
+            w = right - left
+            h = lower - upper
+            w, h = max(w, h), min(w, h)  # from here on w is the longer edge
+            if w > side:
+                h = h / w * side
+                w = side
+            return w * h
+
+        def _get_best_grid(img, side):  # choose (rows, cols) among all grids with rows * cols <= max_partition
+            img_area = img.size[0] * img.size[1]
+
+            candidate_grids = []
+            for i in range(1, max_partition + 1):
+                for j in range(1, max_partition + 1):
+                    if i * j <= max_partition:
+                        candidate_grids.append((i, j))
+
+            all_grids = []
+            good_grids = []
+            for grid in candidate_grids:
+                partition = _partition(img, grid)
+                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                assert covering_ratio <= 1.0  # sanity check only (tiles never cover more than the image); stripped under -O
+                all_grids.append((grid, covering_ratio))
+                if covering_ratio > covering_threshold:
+                    good_grids.append((grid, covering_ratio))
+
+            if len(good_grids) > 0:
+                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+            else:
+                # pick the partition with maximum covering_ratio and break the tie using #sub_images
+                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
+
+        if convert_to_rgb and image.mode != 'RGB':
+            image = image.convert('RGB')
+
+        # Pick the grid, cut the tiles, and (when there is more than one tile) put the full image first.
+        sides = self.get_image_size()
+        if sides[0] != sides[1]:
+            raise ValueError('get_image_size() returns non-square size')
+        side = sides[0]
+        grid = _get_best_grid(image, side)
+        partition = _partition(image, grid)
+        crops = [image.crop(p) for p in partition]
+        if len(crops) > 1:
+            crops.insert(0, image)  # the untiled image goes first, ahead of its tiles
+        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)  # one (1, 3, side, side) entry per crop, stacked on dim 0
+        image_placeholders = self.construct_image_placeholders(grid)
+        return pixel_values, image_placeholders, grid
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)  # pure delegation to whatever self.tokenizer is
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)  # pure delegation to whatever self.tokenizer is
+
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(  # special tokens are dropped; tokenization spaces are left as decoded
+            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    @property
+    def model_input_names(self):  # tokenizer + image-processor input names, de-duplicated in first-seen order
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))  # dict.fromkeys keeps order while removing duplicates
+        return names_from_processor + ["second_per_grid_ts"]  # NOTE(review): the outputs visible here are input_ids/attention_mask/pixel_values/grids — confirm this extra name belongs
+
+
+AutoProcessor.register("OvisProcessor", OvisProcessor)
\ No newline at end of file
-- 
GitLab


From ece5a8b0b6657b8c46b20daba2869762d1765008 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Wed, 30 Apr 2025 00:52:48 -0700
Subject: [PATCH 054/461] Make the _apply_rotary_emb compatible with dynamo
 (#17435)

---
 vllm/model_executor/layers/rotary_embedding.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 00f4e66bd..b179a0f00 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -32,6 +32,9 @@ from transformers import PretrainedConfig
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 
+if current_platform.is_cuda_alike():
+    from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
     x1 = x[..., :x.shape[-1] // 2]
@@ -78,7 +81,6 @@ def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
             positional embeddings.
     """
     if current_platform.is_cuda_alike():
-        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
         return apply_rotary_emb(x.unsqueeze(0), cos, sin,
                                 not is_neox_style).squeeze(0)
     else:
-- 
GitLab


From 1534d389af44a779d6b8b16f8574a50a2ab06f38 Mon Sep 17 00:00:00 2001
From: Chauncey 
Date: Wed, 30 Apr 2025 16:52:19 +0800
Subject: [PATCH 055/461] [Misc] Remove deprecated files (#17447)

Signed-off-by: chaunceyjiang 
---
 .../guided_decoding/reasoner/__init__.py      | 35 -------------------
 1 file changed, 35 deletions(-)
 delete mode 100644 vllm/model_executor/guided_decoding/reasoner/__init__.py

diff --git a/vllm/model_executor/guided_decoding/reasoner/__init__.py b/vllm/model_executor/guided_decoding/reasoner/__init__.py
deleted file mode 100644
index ab6e47c00..000000000
--- a/vllm/model_executor/guided_decoding/reasoner/__init__.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-from transformers import PreTrainedTokenizer
-
-from vllm.logger import init_logger
-from vllm.model_executor.guided_decoding.reasoner.deepseek_reasoner import (  # noqa: E501
-    DeepSeekReasoner)
-from vllm.model_executor.guided_decoding.reasoner.reasoner import Reasoner
-
-logger = init_logger(__name__)
-
-
-def get_reasoner(tokenizer: PreTrainedTokenizer,
-                 reasoning_backend: str | None) -> Reasoner | None:
-    if reasoning_backend is None:
-        # No reasoning backend specified
-        return None
-    elif reasoning_backend == "deepseek_r1":
-        return DeepSeekReasoner.from_tokenizer(tokenizer)
-    elif reasoning_backend == "granite":
-        logger.warning(
-            "Granite reasoner not yet implemented for structured outputs")
-        return None
-    else:
-        # Raise a warning for unknown reasoning backend and return None
-        # We cannot raise an error here because some reasoning models
-        # may not have a corresponding Reasoner class.
-        logger.warning("Unknown reasoning backend %s for structured outputs ",
-                       reasoning_backend)
-        return None
-
-
-__all__ = ["Reasoner", "get_reasoner"]
-- 
GitLab


From d8037867313eef09779b265ea4bace76d2d118ba Mon Sep 17 00:00:00 2001
From: "rongfu.leng" 
Date: Wed, 30 Apr 2025 18:20:39 +0800
Subject: [PATCH 056/461] [V1][Bugfix]: vllm v1 verison metric num_gpu_blocks
 is None (#15755)

Signed-off-by: rongfu.leng 
---
 vllm/v1/engine/async_llm.py   |  3 ++-
 vllm/v1/engine/core.py        |  9 ++++++++-
 vllm/v1/engine/core_client.py | 13 ++++++++++---
 vllm/v1/metrics/loggers.py    | 23 +++++++++++++++++------
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 2562fcc9c..14ce820cc 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -120,7 +120,8 @@ class AsyncLLM(EngineClient):
             executor_class=executor_class,
             log_stats=self.log_stats,
         )
-
+        for stat_logger in self.stat_loggers[0]:
+            stat_logger.log_engine_initialized()
         self.output_handler: Optional[asyncio.Task] = None
         try:
             # Start output handler eagerly if we are in the asyncio eventloop.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 80807665e..5912318f1 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import json
 import os
 import queue
 import signal
@@ -116,6 +117,7 @@ class EngineCore:
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
+        self.vllm_config = vllm_config
 
     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -507,7 +509,12 @@ class EngineCoreProc(EngineCore):
                             bind=False) as socket:
 
             # Send ready message to front-end once input socket is connected.
-            socket.send(b'READY')
+            message_dict = {
+                'type': 'READY',
+                'num_gpu_blocks': self.vllm_config.cache_config.num_gpu_blocks,
+            }
+            message = json.dumps(message_dict).encode('utf-8')
+            socket.send(message)
 
             while True:
                 # (RequestType, RequestData)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index dd5190996..0d5d92f72 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import contextlib
+import json
 import queue
 import uuid
 import weakref
@@ -362,6 +363,7 @@ class MPClient(EngineCoreClient):
         executor_class: type[Executor],
         log_stats: bool,
     ):
+        self.vllm_config = vllm_config
         # Serialization setup.
         self.encoder = MsgpackEncoder()
         self.decoder = MsgpackDecoder(EngineCoreOutputs)
@@ -430,14 +432,19 @@ class MPClient(EngineCoreClient):
                 raise RuntimeError("Engine core initialization failed. "
                                    "See root cause above.")
 
-            eng_id_bytes, msg = sync_input_socket.recv_multipart()
+            eng_id_bytes, data = sync_input_socket.recv_multipart()
             eng_id = int.from_bytes(eng_id_bytes, byteorder="little")
             if eng_id not in identities:
                 raise RuntimeError(f"Unexpected or duplicate engine: {eng_id}")
-            if msg != b'READY':
-                raise RuntimeError(f"Engine {eng_id} failed: {msg.decode()}")
+            message_dict = json.loads(data.decode('utf-8'))
+            if message_dict['type'] != 'READY':
+                raise RuntimeError(f"Engine {eng_id} failed: {data.decode()}")
             logger.info("Core engine process %d ready.", eng_id)
             identities.discard(eng_id)
+            # Setup KV cache config with initialization state from
+            # engine core process.
+            self.vllm_config.cache_config.num_gpu_blocks = message_dict[
+                'num_gpu_blocks']
 
     def _init_core_engines(
         self,
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 7051c681b..9109bdcf4 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -39,6 +39,10 @@ class StatLoggerBase(ABC):
                iteration_stats: Optional[IterationStats]):
         ...
 
+    @abstractmethod
+    def log_engine_initialized(self):
+        ...
+
     def log(self):  # noqa
         pass
 
@@ -47,6 +51,7 @@ class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self.engine_index = engine_index
+        self.vllm_config = vllm_config
         self._reset(time.monotonic())
         self.last_scheduler_stats = SchedulerStats()
         # Prefix cache metrics. This cannot be reset.
@@ -127,12 +132,19 @@ class LoggingStatLogger(StatLoggerBase):
         if scheduler_stats.spec_decoding_stats is not None:
             self.spec_decoding_logging.log(log_fn=log_fn)
 
+    def log_engine_initialized(self):  # logs cache_config.num_gpu_blocks from the stored vllm_config
+        logger.info(
+            "vllm cache_config_info with initialization " \
+            "after num_gpu_blocks is: %d",
+            self.vllm_config.cache_config.num_gpu_blocks)  # NOTE(review): %d needs an int — confirm this can no longer be None here
+
 
 class PrometheusStatLogger(StatLoggerBase):
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self._unregister_vllm_metrics()
-
+        self.vllm_config = vllm_config
+        self.engine_index = engine_index
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
         self.show_hidden_metrics = \
@@ -342,13 +354,9 @@ class PrometheusStatLogger(StatLoggerBase):
                         self.labelname_running_lora_adapters,
                     ])
 
-        #
-        # Cache config info metric
-        #
-        self.log_metrics_info("cache_config", vllm_config.cache_config)
-
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         metrics_info = config_obj.metrics_info()
+        metrics_info["engine"] = self.engine_index
 
         name, documentation = None, None
         if type == "cache_config":
@@ -442,6 +450,9 @@ class PrometheusStatLogger(StatLoggerBase):
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 prometheus_client.REGISTRY.unregister(collector)
 
+    def log_engine_initialized(self):
+        self.log_metrics_info("cache_config", self.vllm_config.cache_config)
+
 
 def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
     """
-- 
GitLab


From a7d5b016bd0a882767dc1c3cc1537dc8c93a2ea7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= 
Date: Wed, 30 Apr 2025 13:03:22 +0200
Subject: [PATCH 057/461] [TPU][V1][CI] Update regression test baseline for v6
 CI (#17064)

Signed-off-by: NickLucche 
---
 tests/v1/tpu/test_perf.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index 94a1da88a..811833f73 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -59,17 +59,16 @@ TEST_PARAMS = [
         prefix_len=500,
         decode_len=50,
 
-        # (This is the active CI/CD instance)
         # commit id: ccb246776d93ef105904a8ec015b3587240a1183
-        # tpu: v5lite (vllm CI/CD)
-        expected_avg_time=1.4,
-        err_tol=0.30,
+        # tpu: v5lite (old vllm CI/CD)
+        # expected_avg_time=1.4,
+        # err_tol=0.30,
 
-        # (TODO: There is no v6e in CI/CD currently)
+        # (This is the active CI/CD instance)
         # commit id: ccb246776d93ef105904a8ec015b3587240a1183
-        # tpu: v6e
-        # expected_avg_time=1.5,
-        # err_tol=0.20,
+        # tpu: v6e (current vllm CI/CD)
+        expected_avg_time=1.7,  # measured with VLLM_XLA_CACHE_PATH=  
+        err_tol=0.20,
     ),
 ]
 
-- 
GitLab


From 77073c77bc2006eb80ea6d5128f076f5e6c6f54f Mon Sep 17 00:00:00 2001
From: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
Date: Wed, 30 Apr 2025 14:27:21 +0200
Subject: [PATCH 058/461] [Core] Prevent side-channel attacks via cache salting
 (#17045)

Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com>
---
 docs/source/design/v1/prefix_caching.md       |  20 +-
 tests/entrypoints/openai/test_serving_chat.py |  40 ++++
 tests/tokenization/test_detokenize.py         |  12 +-
 tests/v1/core/test_kv_cache_utils.py          |  43 ++++-
 tests/v1/core/test_prefix_caching.py          |  64 ++++++-
 tests/v1/engine/test_engine_core.py           |   1 +
 tests/v1/engine/test_engine_core_client.py    |   1 +
 tests/v1/engine/test_output_processor.py      |   7 +-
 vllm/entrypoints/openai/protocol.py           |  32 +++-
 vllm/entrypoints/openai/serving_engine.py     |   3 +
 vllm/inputs/data.py                           |  20 +-
 vllm/inputs/preprocess.py                     | 178 +++++++-----------
 vllm/multimodal/inputs.py                     |   5 +
 vllm/multimodal/processing.py                 |   2 +-
 vllm/v1/core/kv_cache_utils.py                |  13 +-
 vllm/v1/engine/__init__.py                    |   1 +
 vllm/v1/engine/processor.py                   |   1 +
 vllm/v1/request.py                            |   3 +
 18 files changed, 324 insertions(+), 122 deletions(-)

diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md
index ec1f3cb8d..ec661d8ec 100644
--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/source/design/v1/prefix_caching.md
@@ -16,7 +16,7 @@ In the example above, the KV cache in the first block can be uniquely identified
 
 * Parent hash value: The hash value of the parent hash block.
 * Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision.
-* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below).
+* Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments.
 
 > **Note 1:** We only cache full blocks.
 
@@ -76,6 +76,24 @@ Block 3
 
 In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of major KV cache operators (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end to end prefix caching workflow.
 
+**Cache Isolation for Security**
+To improve privacy in shared environments, vLLM supports isolating prefix cache reuse through optional per-request salting. When a request includes a `cache_salt`, that value is injected into the hash of the first block, so only requests with the same salt can reuse cached KV blocks. This prevents timing-based attacks in which an adversary infers cached content by observing latency differences, and it does so without compromising performance.
+
+```json
+{
+  "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Here is a document with details about the world series: ..."},
+    {"role": "user", "content": "Who won the world series in 2020?"}
+  ],
+  "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ=="
+}
+```
+
+With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
+
+> **Note:** Cache isolation is not supported in engine V0.
+
 ## Data Structure
 
 The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 19d16713b..5e11af8cf 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -272,3 +272,43 @@ def test_serving_chat_could_load_correct_generation_config():
 
     assert mock_engine.generate.call_args.args[1].temperature == 0.0
     assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+
+
+def test_serving_chat_did_set_correct_cache_salt():  # checks the engine prompt passed to generate() with and without cache_salt
+    mock_model_config = MockModelConfig()
+
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=mock_model_config)
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+
+    # Test cache_salt
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+    )
+
+    # By default cache_salt in the engine prompt is not set
+    with suppress(Exception):  # presumably the mocked engine fails after generate() is called; only the call args are inspected
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert "cache_salt" not in mock_engine.generate.call_args.args[0]
+
+    # Test with certain cache_salt
+    req.cache_salt = "test_salt"
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+    assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index f8e213b9c..079100e78 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -60,8 +60,16 @@ def _run_incremental_decode(tokenizer,
         skip_special_tokens=skip_special_tokens,
         spaces_between_special_tokens=spaces_between_special_tokens,
     )
-    request = EngineCoreRequest("", prompt_token_ids, None, None, None, params,
-                                None, 0.0, None)
+    request = EngineCoreRequest("",
+                                prompt_token_ids,
+                                None,
+                                None,
+                                None,
+                                params,
+                                None,
+                                0.0,
+                                None,
+                                cache_salt=None)
 
     if fast is None:
         detokenizer = IncrementalDetokenizer.from_new_request(
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index e73e08e74..e8069b8c6 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -29,7 +29,8 @@ from vllm.v1.request import Request
 def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
-                 mm_hashes=None):
+                 mm_hashes=None,
+                 cache_salt=None):
     if mm_positions is None:
         multi_modal_inputs = None
     else:
@@ -45,6 +46,7 @@ def make_request(request_id,
         eos_token_id=100,
         arrival_time=0,
         lora_request=None,
+        cache_salt=cache_salt,
     )
 
 
@@ -213,6 +215,45 @@ def test_generate_block_hash_extra_keys_no_mm_inputs():
     assert next_mm_idx == 0
 
 
+def test_generate_block_hash_extra_keys_cache_salt():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=None,
+        mm_hashes=None,
+        cache_salt="salt",
+    )
+
+    # salt is added when the token range starts at token 0
+    extra_keys, _ = generate_block_hash_extra_keys(request, 0, 1, 0)
+    assert extra_keys == ('salt', )
+    extra_keys, _ = generate_block_hash_extra_keys(request, 0, 10, 0)
+    assert extra_keys == ('salt', )
+
+    # no salt is added when the token range starts after token 0
+    extra_keys, _ = generate_block_hash_extra_keys(request, 1, 2, 0)
+    assert extra_keys is None
+    extra_keys, _ = generate_block_hash_extra_keys(request, 6, 10, 0)
+    assert extra_keys is None
+
+    # works together with other extra keys
+    request_mm = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(20)],
+        mm_positions=[
+            PlaceholderRange(offset=0, length=5),
+        ],
+        mm_hashes=["hash1"],
+        cache_salt="salt",
+    )
+
+    # The multi-modal hash comes first, followed by the salt
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(
+        request_mm, 0, 5, 0)
+    assert extra_keys == ("hash1", "salt")
+    assert next_mm_idx == 1
+
+
 @pytest.mark.parametrize("hash_fn", [sha256, hash])
 def test_hash_block_tokens(hash_fn):
     parent_block_hash = 123
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index b2e8ff614..ae4bd95d2 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -21,7 +21,8 @@ def make_request(request_id,
                  prompt_token_ids,
                  mm_positions=None,
                  mm_hashes=None,
-                 prompt_logprobs: Optional[int] = None):
+                 prompt_logprobs: Optional[int] = None,
+                 cache_salt: Optional[str] = None):
     if mm_positions is None:
         multi_modal_inputs = None
     else:
@@ -38,6 +39,7 @@ def make_request(request_id,
         eos_token_id=100,
         arrival_time=0,
         lora_request=None,
+        cache_salt=cache_salt,
     )
 
 
@@ -603,6 +605,66 @@ def test_mm_prefix_caching():
     assert num_computed_tokens == 3 * 16
 
 
+def test_cache_key_salting():
+    """
+    This tests that cache salts are applied during hashing and that the
+    cache is separated as expected.
+    """
+    block_size = 16
+    manager = KVCacheManager(
+        make_kv_cache_config(block_size, 11),
+        max_model_len=8192,
+        enable_caching=True,
+    )
+
+    # 3 complete blocks and an incomplete block with 11 tokens.
+    common_token_ids = [i for i in range(3) for _ in range(block_size)]
+    token_ids = common_token_ids + [3] * 11
+    req0 = make_request("0", token_ids, cache_salt="salt1")
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
+
+    # Only the first completed block should carry the salt as extra keys.
+    assert not computed_blocks
+    assert num_computed_tokens == 0
+    block_hashes = manager.req_to_block_hashes[req0.request_id]
+    assert len(block_hashes) == 3
+    assert block_hashes[0].extra_keys == ("salt1", )
+    assert block_hashes[1].extra_keys is None
+    assert block_hashes[2].extra_keys is None
+
+    blocks = manager.allocate_slots(req0, 59, computed_blocks)
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4]
+    req0.num_computed_tokens = 59
+
+    # Append slots without allocating a new block.
+    for _ in range(5):
+        req0.append_output_token_ids(8)
+    new_blocks = manager.allocate_slots(req0, 5)
+    assert new_blocks is not None and len(new_blocks) == 0
+
+    # Now one more block that should not have extra keys.
+    assert len(block_hashes) == 4
+    assert block_hashes[3].extra_keys is None
+
+    # Test cache hit with a new request that has the same salt.
+    token_ids = common_token_ids + [4] * 11
+    req1 = make_request("1", token_ids, cache_salt="salt1")
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+    # Should match only a prefix of 3 blocks.
+    assert len(computed_blocks) == 3
+    assert num_computed_tokens == 3 * block_size
+
+    # Test cache miss with same content but different salt.
+    token_ids = common_token_ids + [4] * 11
+    req2 = make_request("2", token_ids, cache_salt="salt2")
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
+    assert len(computed_blocks) == 0
+    assert num_computed_tokens == 0
+    block_hashes = manager.req_to_block_hashes[req2.request_id]
+    assert len(block_hashes) == 3
+    assert block_hashes[0].extra_keys == ("salt2", )
+
+
 def test_prefill_not_enough_free_blocks_with_computed_blocks():
     """
     This is a unit test that tests the correctness of the allocate_slots
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 30fa9e371..dcf494825 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -40,6 +40,7 @@ def make_request() -> EngineCoreRequest:
         eos_token_id=None,
         arrival_time=time.time(),
         lora_request=None,
+        cache_salt=None,
     )
 
 
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 8cc36fa16..5514a3284 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -43,6 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
         eos_token_id=None,
         arrival_time=time.time(),
         lora_request=None,
+        cache_salt=None,
     )
 
 
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index d2bb7d88f..fac701c4c 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -57,6 +57,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
                           mm_placeholders=None,
                           eos_token_id=None,
                           lora_request=None,
+                          cache_salt=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -403,6 +404,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
                           mm_placeholders=None,
                           eos_token_id=None,
                           lora_request=None,
+                          cache_salt=None,
                           sampling_params=SamplingParams(
                               skip_special_tokens=False,
                               spaces_between_special_tokens=False,
@@ -503,7 +505,7 @@ def test_stop_token(include_stop_str_in_output: bool,
       reason should be "stop" (i.e. first control token causes stop
       and is represented in output text)
 
-    * else, the detokenized string should be 
+    * else, the detokenized string should be
       ... and the finish reason should be "stop"
       (i.e. first control token causes stop but is not represented
       in output text.)
@@ -565,6 +567,7 @@ def test_stop_token(include_stop_str_in_output: bool,
         mm_placeholders=None,
         eos_token_id=eos_token_id,
         lora_request=None,
+        cache_salt=None,
         sampling_params=SamplingParams(
             skip_special_tokens=False,
             spaces_between_special_tokens=False,
@@ -661,6 +664,7 @@ def test_stop_string(include_stop_str_in_output: bool,
             mm_placeholders=None,
             eos_token_id=None,
             lora_request=None,
+            cache_salt=None,
             sampling_params=SamplingParams(
                 skip_special_tokens=False,
                 spaces_between_special_tokens=False,
@@ -774,6 +778,7 @@ def test_iteration_stats(dummy_test_vectors):
             mm_placeholders=None,
             eos_token_id=None,
             lora_request=None,
+            cache_salt=None,
             sampling_params=SamplingParams(),
         ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index d444442a9..389557dfb 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -14,6 +14,7 @@ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
                       ValidationInfo, field_validator, model_validator)
 from typing_extensions import TypeAlias
 
+from vllm import envs
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 from vllm.logger import init_logger
 from vllm.pooling_params import PoolingParams
@@ -408,6 +409,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "If specified with 'logprobs', tokens are represented "
             " as strings of the form 'token_id:{token_id}' so that tokens "
             "that are not JSON-encodable can be identified."))
+    cache_salt: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker to guess prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bit). Not supported by vLLM engine V0."))
 
     # doc: end-chat-completion-extra-params
 
@@ -726,6 +736,20 @@ class ChatCompletionRequest(OpenAIBaseModel):
                              "`add_generation_prompt` to True.")
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None:
+            if not envs.VLLM_USE_V1:
+                raise ValueError(
+                    "Parameter 'cache_salt' is not supported with "
+                    "this instance of vLLM, which uses engine V0.")
+            if not isinstance(data["cache_salt"],
+                              str) or not data["cache_salt"]:
+                raise ValueError("Parameter 'cache_salt' must be a "
+                                 "non-empty string if provided.")
+        return data
+
 
 class CompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
@@ -1622,9 +1646,9 @@ class TranscriptionRequest(OpenAIBaseModel):
 
     # doc: begin-transcription-extra-params
     stream: Optional[bool] = False
-    """Custom field not present in the original OpenAI definition. When set, 
+    """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
-    Completion endpoint. 
+    Completion endpoint.
     """
     # Flattened stream option to simplify form data.
     stream_include_usage: Optional[bool] = False
@@ -1642,7 +1666,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     """
 
     top_p: Optional[float] = None
-    """Enables nucleus (top-p) sampling, where tokens are selected from the 
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
     smallest possible set whose cumulative probability exceeds `p`.
     """
 
@@ -1650,7 +1674,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     """Limits sampling to the `k` most probable tokens at each step."""
 
     min_p: Optional[float] = None
-    """Filters out tokens with a probability lower than `min_p`, ensuring a 
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
     minimum likelihood threshold during sampling.
     """
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index c3121eff5..6123811aa 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -470,6 +470,9 @@ class OpenAIServing:
         if request.mm_processor_kwargs is not None:
             engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
 
+        if hasattr(request, "cache_salt") and request.cache_salt is not None:
+            engine_prompt["cache_salt"] = request.cache_salt
+
         return conversation, [request_prompt], [engine_prompt]
 
     def _log_inputs(
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 970b36bca..167189ed1 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -28,6 +28,11 @@ class TextPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
+    cache_salt: NotRequired[str]
+    """
+    Optional cache salt to be used for prefix caching.
+    """
+
 
 class TokensPrompt(TypedDict):
     """Schema for a tokenized prompt."""
@@ -52,6 +57,11 @@ class TokensPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
+    cache_salt: NotRequired[str]
+    """
+    Optional cache salt to be used for prefix caching.
+    """
+
 
 SingletonPrompt = Union[str, TextPrompt, TokensPrompt]
 """
@@ -141,11 +151,17 @@ class TokenInputs(TypedDict):
     The original prompt text corresponding to the token IDs, if available.
     """
 
+    cache_salt: NotRequired[str]
+    """
+    Optional cache salt to be used for prefix caching.
+    """
+
 
 def token_inputs(
     prompt_token_ids: list[int],
     token_type_ids: Optional[list[int]] = None,
     prompt: Optional[str] = None,
+    cache_salt: Optional[str] = None,
 ) -> TokenInputs:
     """Construct :class:`TokenInputs` from optional values."""
     inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
@@ -154,6 +170,8 @@ def token_inputs(
         inputs["prompt"] = prompt
     if token_type_ids is not None:
         inputs["token_type_ids"] = token_type_ids
+    if cache_salt is not None:
+        inputs["cache_salt"] = cache_salt
 
     return inputs
 
@@ -217,7 +235,7 @@ def zip_enc_dec_prompts(
     """
     Zip encoder and decoder prompts together into a list of
     :class:`ExplicitEncoderDecoderPrompt` instances.
-    
+
     ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
     dictionary will be used for every encoder/decoder prompt. If an iterable is
     provided, it will be zipped with the encoder/decoder prompts.
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 56b60b893..83e6907f8 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -17,7 +17,8 @@ from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
 from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                    PromptType, SingletonInputs, SingletonPrompt, token_inputs)
-from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
+from .parse import (ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt,
+                    is_explicit_encoder_decoder_prompt, parse_singleton_prompt)
 
 logger = init_logger(__name__)
 
@@ -283,6 +284,29 @@ class InputPreprocessor:
         return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                   return_mm_hashes)
 
+    def _get_prompt_data(self, parsed_prompt: Union[ParsedStrPrompt,
+                                                    ParsedTextPrompt,
+                                                    ParsedTokensPrompt]):
+        prompt_text = None
+        prompt_token_ids = None
+        token_type_ids = None
+        cache_salt = None
+
+        if parsed_prompt["type"] == "str":
+            prompt_text = parsed_prompt["content"]
+        else:
+            cache_salt = parsed_prompt["content"].get("cache_salt")
+            if parsed_prompt["type"] == "text":
+                prompt_text = parsed_prompt["content"]["prompt"]
+            elif parsed_prompt["type"] == "tokens":
+                prompt_token_ids = parsed_prompt["content"].get(
+                    "prompt_token_ids")
+                token_type_ids = parsed_prompt["content"].get("token_type_ids")
+            else:
+                assert_never(parsed_prompt)
+
+        return prompt_text, prompt_token_ids, token_type_ids, cache_salt
+
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
@@ -304,70 +328,36 @@ class InputPreprocessor:
         * :class:`SingletonInputs` instance
         """
         parsed = parse_singleton_prompt(prompt)
-
-        if parsed["type"] == "str":
-            prompt_text = parsed["content"]
-            prompt_token_ids = self._tokenize_prompt(
-                prompt_text,
+        prompt_text, prompt_token_ids, token_type_ids, cache_salt = \
+            self._get_prompt_data(parsed)
+
+        # If multimodal data is present, process and return immediately
+        if parsed["type"] != "str" and parsed["content"].get(
+                "multi_modal_data") is not None:
+            inputs = self._process_multimodal(
+                prompt_text if prompt_text is not None else prompt_token_ids,
+                parsed["content"]["multi_modal_data"],
+                parsed["content"].get("mm_processor_kwargs"),
                 lora_request=lora_request,
-                tokenization_kwargs=tokenization_kwargs,
-            )
-
-            return token_inputs(
-                prompt=prompt_text,
-                prompt_token_ids=prompt_token_ids,
+                return_mm_hashes=return_mm_hashes,
             )
+            if cache_salt is not None:
+                inputs["cache_salt"] = cache_salt
+            return inputs
 
-        if parsed["type"] == "tokens":
-            tokens_content = parsed["content"]
-
-            prompt_token_ids = tokens_content["prompt_token_ids"]
-            token_type_ids = tokens_content.get("token_type_ids")
-            multi_modal_data = tokens_content.get("multi_modal_data")
-            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
-
-            if multi_modal_data is not None:
-                return self._process_multimodal(
-                    prompt_token_ids,
-                    multi_modal_data,
-                    mm_processor_kwargs,
-                    lora_request=lora_request,
-                    return_mm_hashes=return_mm_hashes,
-                )
-
-            return token_inputs(
-                prompt_token_ids=prompt_token_ids,
-                token_type_ids=token_type_ids,
-            )
-
-        if parsed["type"] == "text":
-            text_content = parsed["content"]
-
-            prompt_text = text_content["prompt"]
-            multi_modal_data = text_content.get("multi_modal_data")
-            mm_processor_kwargs = text_content.get("mm_processor_kwargs")
-
-            if multi_modal_data is not None:
-                return self._process_multimodal(
-                    prompt_text,
-                    multi_modal_data,
-                    mm_processor_kwargs,
-                    lora_request=lora_request,
-                    return_mm_hashes=return_mm_hashes,
-                )
-
+        if prompt_token_ids is None:
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 lora_request=lora_request,
                 tokenization_kwargs=tokenization_kwargs,
             )
 
-            return token_inputs(
-                prompt=prompt_text,
-                prompt_token_ids=prompt_token_ids,
-            )
-
-        assert_never(parsed)
+        return token_inputs(
+            prompt=prompt_text,
+            prompt_token_ids=prompt_token_ids,
+            token_type_ids=token_type_ids,
+            cache_salt=cache_salt,
+        )
 
     async def _prompt_to_llm_inputs_async(
         self,
@@ -379,64 +369,35 @@ class InputPreprocessor:
         """Async version of :meth:`_extract_prompt_components`."""
         parsed = parse_singleton_prompt(prompt)
 
-        if parsed["type"] == "str":
-            prompt_text = parsed["content"]
-            prompt_token_ids = await self._tokenize_prompt_async(
-                prompt_text,
-                lora_request=lora_request,
-                tokenization_kwargs=tokenization_kwargs,
-            )
+        prompt_text, prompt_token_ids, token_type_ids, cache_salt = \
+            self._get_prompt_data(parsed)
 
-            return token_inputs(
-                prompt=prompt_text,
-                prompt_token_ids=prompt_token_ids,
+        if parsed["type"] != "str" and parsed["content"].get(
+                "multi_modal_data") is not None:
+            inputs = await self._process_multimodal_async(
+                prompt_token_ids if prompt_text is None else prompt_text,
+                parsed["content"]["multi_modal_data"],
+                parsed["content"].get("mm_processor_kwargs"),
+                lora_request=lora_request,
+                return_mm_hashes=return_mm_hashes,
             )
+            if cache_salt is not None:
+                inputs["cache_salt"] = cache_salt
+            return inputs
 
-        if parsed["type"] == "tokens":
-            tokens_content = parsed["content"]
-
-            prompt_token_ids = tokens_content["prompt_token_ids"]
-            multi_modal_data = tokens_content.get("multi_modal_data")
-            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
-
-            if multi_modal_data is not None:
-                return await self._process_multimodal_async(
-                    prompt_token_ids,
-                    multi_modal_data,
-                    mm_processor_kwargs,
-                    lora_request=lora_request,
-                    return_mm_hashes=return_mm_hashes,
-                )
-
-            return token_inputs(prompt_token_ids=prompt_token_ids)
-
-        if parsed["type"] == "text":
-            text_content = parsed["content"]
-
-            prompt_text = text_content["prompt"]
-            multi_modal_data = text_content.get("multi_modal_data")
-            mm_processor_kwargs = text_content.get("mm_processor_kwargs")
-
-            if multi_modal_data is not None:
-                return await self._process_multimodal_async(
-                    prompt_text,
-                    multi_modal_data,
-                    mm_processor_kwargs,
-                    lora_request=lora_request,
-                    return_mm_hashes=return_mm_hashes,
-                )
-
+        if prompt_token_ids is None:
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
                 lora_request=lora_request,
+                tokenization_kwargs=tokenization_kwargs,
             )
 
-            return token_inputs(
-                prompt=prompt_text,
-                prompt_token_ids=prompt_token_ids,
-            )
-
-        assert_never(parsed)
+        return token_inputs(
+            prompt=prompt_text,
+            prompt_token_ids=prompt_token_ids,
+            token_type_ids=token_type_ids,
+            cache_salt=cache_salt,
+        )
 
     def _build_enc_dec_llm_inputs(
         self,
@@ -516,6 +477,11 @@ class InputPreprocessor:
                     mm_hashes=inputs["mm_hashes"],
                     mm_placeholders=inputs["mm_placeholders"],
                 )
+
+            cache_salt = inputs.get("cache_salt")
+            if cache_salt is not None:
+                decoder_inputs["cache_salt"] = cache_salt
+
         elif inputs["type"] == "token":
             # Text-only inputs
             encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 6855808e8..978fb4231 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -826,6 +826,11 @@ class MultiModalInputs(TypedDict):
     :code:`prompt_token_ids`.
     """
 
+    cache_salt: NotRequired[str]
+    """
+    Optional cache salt to be used for prefix caching.
+    """
+
 
 class MultiModalEncDecInputs(MultiModalInputs):
     """
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index d6ba8f1bc..e8745a8f1 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1789,7 +1789,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         mm_data: MultiModalDataDict,
     ) -> Union[str, list[int]]:
         """
-        Create input prompt for the encoder. HF processor will be applied on 
+        Create input prompt for the encoder. HF processor will be applied on
         this prompt during profiling and generation.
         """
         raise NotImplementedError
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3026ecc1c..27c515835 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -275,7 +275,10 @@ def need_extra_keys(request: Request) -> bool:
 
     # Multimodal requests need to include the MM hash.
     # LoRA requests need to include the LoRA ID.
-    return bool(request.mm_positions) or (request.lora_request is not None)
+    # Requests with a provided cache salt need to include the salt.
+    return bool(request.mm_positions) or (request.lora_request
+                                          is not None) or (request.cache_salt
+                                                           is not None)
 
 
 def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
@@ -380,8 +383,10 @@ def generate_block_hash_extra_keys(
     mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys(
         request, start_token_idx, end_token_idx, start_mm_idx)
     lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request)
+    cache_salt_keys: list[str] = [request.cache_salt] if (
+        start_token_idx == 0 and request.cache_salt) else []
 
-    extra_keys: list[Any] = lora_extra_keys + mm_extra_keys
+    extra_keys: list[Any] = lora_extra_keys + mm_extra_keys + cache_salt_keys
 
     if not extra_keys:
         return None, new_start_mm_idx
@@ -657,10 +662,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     Only models with one type of KV cache are supported yet. This function tries
-    to convert the KV cache specs to one type if the model is a hybrid model 
+    to convert the KV cache specs to one type if the model is a hybrid model
     with multiple type of KV cache. It will convert all SlidingWindowSpec to
     FullAttentionSpec if both types are present.
-    
+
     Args:
         kv_cache_spec: The kv cache spec of each attention layer in the model
     """
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 047466961..e33d1a1e5 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -57,6 +57,7 @@ class EngineCoreRequest(
     eos_token_id: Optional[int]
     arrival_time: float
     lora_request: Optional[LoRARequest]
+    cache_salt: Optional[str]
 
     # Used in DP case to indicate which wave of requests this is expected to
     # belong to, to cover a race condition where the request is sent before
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index b98a31773..27d70a781 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -317,6 +317,7 @@ class Processor:
             eos_token_id=eos_token_id,
             arrival_time=arrival_time,
             lora_request=lora_request,
+            cache_salt=decoder_inputs.get("cache_salt"),
         )
 
     def _validate_model_inputs(self,
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 3b9b666f9..fde366d61 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -29,6 +29,7 @@ class Request:
         arrival_time: float,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
+        cache_salt: Optional[str] = None,
     ) -> None:
         self.request_id = request_id
         self.sampling_params = sampling_params
@@ -51,6 +52,7 @@ class Request:
         self._all_token_ids: list[int] = self.prompt_token_ids.copy()
         self.spec_token_ids: list[int] = []
         self.num_computed_tokens = 0
+        self.cache_salt: Optional[str] = cache_salt
 
         # Multi-modal related
         self.mm_positions = multi_modal_placeholders or []
@@ -89,6 +91,7 @@ class Request:
             lora_request=request.lora_request,
             structured_output_request=StructuredOutputRequest(
                 sampling_params=request.sampling_params),
+            cache_salt=request.cache_salt,
         )
 
     def append_output_token_ids(
-- 
GitLab


From 0be6d05b5ea92dae61cd83f26fea05a48d227802 Mon Sep 17 00:00:00 2001
From: Alec <35311602+alec-flowers@users.noreply.github.com>
Date: Wed, 30 Apr 2025 16:44:45 +0200
Subject: [PATCH 059/461] [V1][Metrics] add support for kv event publishing
 (#16750)

Signed-off-by: alec-flowers 
Signed-off-by: Mark McLoughlin 
Co-authored-by: Mark McLoughlin 
---
 examples/online_serving/kv_events.sh          |  86 +++++
 .../online_serving/kv_events_subscriber.py    | 114 +++++++
 tests/distributed/conftest.py                 | 145 +++++++++
 tests/distributed/test_events.py              | 193 ++++++++++++
 tests/v1/core/test_prefix_caching.py          |  62 +++-
 tests/v1/engine/conftest.py                   |   2 +
 tests/v1/engine/test_engine_core_client.py    | 164 +++++++---
 vllm/config.py                                |  60 ++++
 vllm/distributed/kv_events.py                 | 295 ++++++++++++++++++
 vllm/engine/arg_utils.py                      |  22 +-
 vllm/v1/core/block_pool.py                    |  49 ++-
 vllm/v1/core/kv_cache_manager.py              |  14 +-
 vllm/v1/core/sched/interface.py               |   5 +
 vllm/v1/core/sched/scheduler.py               |  21 +-
 vllm/v1/engine/core.py                        |   2 +
 15 files changed, 1183 insertions(+), 51 deletions(-)
 create mode 100644 examples/online_serving/kv_events.sh
 create mode 100644 examples/online_serving/kv_events_subscriber.py
 create mode 100644 tests/distributed/conftest.py
 create mode 100644 tests/distributed/test_events.py
 create mode 100644 vllm/distributed/kv_events.py

diff --git a/examples/online_serving/kv_events.sh b/examples/online_serving/kv_events.sh
new file mode 100644
index 000000000..a111db217
--- /dev/null
+++ b/examples/online_serving/kv_events.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# This file demonstrates the KV cache event publishing
+# We will launch a vLLM instance configured to publish KV cache
+# events and launch a simple subscriber to log those events.
+
+set -xe
+
+echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
+sleep 1
+
+MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
+
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'cleanup' INT
+
+# Cleanup function
+cleanup() {
+    echo "Caught Ctrl+C, cleaning up..."
+    # Kill only this user's Python processes (as the final cleanup does),
+    # never every Python process on the host; tolerate "no match" under -e.
+    pkill -9 -u "$USER" -f python || true
+    echo "Cleanup complete. Exiting."
+    exit 0
+}
+
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+# A function that waits for the vLLM server to start
+wait_for_server() {
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+vllm serve $MODEL_NAME \
+    --port 8100 \
+    --max-model-len 100 \
+    --enforce-eager \
+    --gpu-memory-utilization 0.8 \
+    --trust-remote-code \
+    --kv-events-config \
+    '{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' &
+
+wait_for_server 8100
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+python3 "$SCRIPT_DIR/kv_events_subscriber.py" &
+sleep 1
+
+# serve two example requests
+output1=$(curl -X POST -s http://localhost:8100/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "'"$MODEL_NAME"'",
+"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
+"max_tokens": 80,
+"temperature": 0
+}')
+
+output2=$(curl -X POST -s http://localhost:8100/v1/completions \
+-H "Content-Type: application/json" \
+-d '{
+"model": "'"$MODEL_NAME"'",
+"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
+"max_tokens": 80,
+"temperature": 0
+}')
+
+# Cleanup commands
+pkill -9 -u "$USER" -f python
+pkill -9 -u "$USER" -f vllm
+
+sleep 1
+
+echo "Cleaned up"
+
+# Print the outputs of the curl requests
+echo ""
+echo "Output of first request: $output1"
+echo "Output of second request: $output2"
+
+echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
+echo ""
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
new file mode 100644
index 000000000..88bbbebd7
--- /dev/null
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Optional, Union
+
+import msgspec
+import zmq
+from msgspec.msgpack import Decoder
+
+
+#
+# Types copied from vllm.distributed.kv_events
+#
+class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
+                 gc=False):
+    ts: float
+    events: list[Any]
+
+
+class KVCacheEvent(msgspec.Struct,
+                   array_like=True,
+                   omit_defaults=True,
+                   gc=False,
+                   tag=True):
+    """Base class for all KV cache-related events"""
+
+
+class BlockStored(KVCacheEvent):
+    block_hashes: list[int]
+    parent_block_hash: Optional[int]
+    token_ids: list[int]
+    block_size: int
+    lora_id: Optional[int]
+
+
+class BlockRemoved(KVCacheEvent):
+    block_hashes: list[int]
+
+
+class AllBlocksCleared(KVCacheEvent):
+    pass
+
+
+class KVEventBatch(EventBatch):
+    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+
+
+def process_event(event_batch):
+    print(f"Received event batch at {event_batch.ts}:")
+    for event in event_batch.events:
+        print(f"  - {event}")
+
+
+def main():
+    """Print KV cache events from the publisher, replaying missed batches."""
+    decoder = Decoder(type=KVEventBatch)
+    last_seq = -1
+    context = zmq.Context()
+
+    # Set up the main subscription socket
+    sub = context.socket(zmq.SUB)
+    sub.connect("tcp://localhost:5557")
+    topic = "kv-events"
+    sub.setsockopt_string(zmq.SUBSCRIBE, topic)
+
+    # Initialize replay socket
+    replay = context.socket(zmq.REQ)
+    replay.connect("tcp://localhost:5558")
+    poller = zmq.Poller()
+    poller.register(replay, zmq.POLLIN)
+
+    print("Listening for KV cache events on topic:", topic)
+
+    while True:
+        try:
+            if sub.poll(50):
+                _, seq_bytes, payload = sub.recv_multipart()
+                seq = int.from_bytes(seq_bytes, "big")
+
+                if last_seq >= 0 and seq > last_seq + 1:
+                    missed = seq - last_seq - 1
+                    print(f"Missed {missed} messages"
+                          f" (last: {last_seq}, current: {seq})")
+
+                    replay.send((last_seq + 1).to_bytes(8, "big"))
+
+                    while poller.poll(timeout=200):
+                        seq_bytes, replay_payload = replay.recv_multipart()
+                        if not replay_payload:
+                            # End of replay marker is sent as an empty frame
+                            # for the payload
+                            break
+
+                        replay_seq = int.from_bytes(seq_bytes, "big")
+
+                        if replay_seq > last_seq:
+                            event_batch = decoder.decode(replay_payload)
+                            process_event(event_batch)
+                            last_seq = replay_seq
+                            if replay_seq >= seq - 1:
+                                break
+
+                event_batch = decoder.decode(payload)
+                process_event(event_batch)
+                last_seq = seq  # remember it so the next gap is detected
+
+            # ... do other periodic work or check for shutdown ...
+        except KeyboardInterrupt:
+            print("Interrupted")
+            break
+        except Exception as e:
+            print("Error decoding message:", e)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
new file mode 100644
index 000000000..ee8f20979
--- /dev/null
+++ b/tests/distributed/conftest.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+import random
+from typing import Optional, Union
+
+import msgspec
+import msgspec.msgpack
+import pytest
+import zmq
+
+from vllm.config import KVEventsConfig
+from vllm.distributed.kv_events import EventPublisherFactory
+
+from .test_events import SampleBatch
+
+
+@pytest.fixture
+def random_port():
+    """Generate a random port number for testing"""
+    return random.randint(10000, 60000)
+
+
+@pytest.fixture
+def publisher_config(random_port, request):
+    """Create a ZMQ publisher config: inproc by default, tcp otherwise"""
+    how = request.param if hasattr(request, "param") else "inproc"
+
+    if how == "inproc":
+        endpoint = f"inproc://test-{random_port}"
+        replay_endpoint = endpoint + "-replay"
+    else:
+        endpoint = f"tcp://*:{random_port}"
+        replay_endpoint = f"tcp://*:{random_port + 1}"
+
+    return KVEventsConfig(enable_kv_cache_events=True,
+                          publisher="zmq",
+                          endpoint=endpoint,
+                          replay_endpoint=replay_endpoint,
+                          buffer_steps=100,
+                          hwm=1000,
+                          topic="test")
+
+
+@pytest.fixture
+def publisher(publisher_config):
+    """Create and return a publisher instance"""
+    pub = EventPublisherFactory.create(publisher_config)
+    yield pub
+    pub.shutdown()
+
+
+@pytest.fixture
+def subscriber(publisher_config):
+    """Create and return a subscriber for testing"""
+    endpoint = publisher_config.endpoint
+    replay_endpoint = publisher_config.replay_endpoint
+
+    if endpoint.startswith("tcp://*"):
+        endpoint = endpoint.replace("*", "127.0.0.1")
+    if replay_endpoint and replay_endpoint.startswith("tcp://*"):
+        replay_endpoint = replay_endpoint.replace("*", "127.0.0.1")
+
+    sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic)
+    yield sub
+    sub.close()
+
+
+class MockSubscriber:
+    """Helper class to receive and verify published events"""
+
+    def __init__(self,
+                 pub_endpoint: str,
+                 replay_endpoint: Optional[str] = None,
+                 topic: str = "",
+                 decode_type=SampleBatch):
+        self.ctx = zmq.Context.instance()
+
+        # Set up subscriber socket
+        self.sub = self.ctx.socket(zmq.SUB)
+        self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8'))
+        self.sub.connect(pub_endpoint)
+
+        # Set up replay socket if provided
+        self.replay = None
+        if replay_endpoint:
+            self.replay = self.ctx.socket(zmq.REQ)
+            self.replay.connect(replay_endpoint)
+
+        self.topic = topic
+        self.topic_bytes = topic.encode('utf-8')
+        self.received_msgs: list[tuple[int, SampleBatch]] = []
+        self.last_seq = -1
+        self.decoder = msgspec.msgpack.Decoder(type=decode_type)
+
+    def receive_one(self,
+                    timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+        """Receive a single message with timeout"""
+        if not self.sub.poll(timeout):
+            return None
+
+        topic_bytes, seq_bytes, payload = self.sub.recv_multipart()
+        assert topic_bytes == self.topic_bytes
+
+        seq = int.from_bytes(seq_bytes, "big")
+        data = self.decoder.decode(payload)
+        self.last_seq = seq
+        self.received_msgs.append((seq, data))
+        return seq, data
+
+    def request_replay(self, start_seq: int) -> None:
+        """Request replay of messages starting from start_seq"""
+        if not self.replay:
+            raise ValueError("Replay socket not initialized")
+
+        self.replay.send(start_seq.to_bytes(8, "big"))
+
+    def receive_replay(self) -> list[tuple[int, SampleBatch]]:
+        """Receive replayed messages"""
+        if not self.replay:
+            raise ValueError("Replay socket not initialized")
+
+        replayed: list[tuple[int, SampleBatch]] = []
+        while True:
+            try:
+                if not self.replay.poll(1000):
+                    break
+
+                frames = self.replay.recv_multipart()
+                if not frames or not frames[-1]:
+                    # End of replay marker
+                    break
+
+                seq_bytes, payload = frames
+                seq = int.from_bytes(seq_bytes, "big")
+                data = self.decoder.decode(payload)
+                replayed.append((seq, data))
+            except zmq.ZMQError as _:
+                break
+
+        return replayed
+
+    def close(self):
+        """Clean up resources"""
+        self.sub.close()
+        if self.replay:
+            self.replay.close()
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
new file mode 100644
index 000000000..15bcfdb85
--- /dev/null
+++ b/tests/distributed/test_events.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+import threading
+import time
+
+import msgspec
+import pytest
+
+from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
+                                        NullEventPublisher)
+
+
+class EventSample(
+        msgspec.Struct,
+        tag=True,  # type: ignore
+        array_like=True  # type: ignore
+):
+    """Test event for publisher testing"""
+    id: int
+    value: str
+
+
+class SampleBatch(EventBatch):
+    """Test event batch for publisher testing"""
+    events: list[EventSample]
+
+
+def create_test_events(count: int) -> SampleBatch:
+    """Create a batch of test events"""
+    events = [EventSample(id=i, value=f"test-{i}") for i in range(count)]
+    return SampleBatch(ts=time.time(), events=events)
+
+
+def test_basic_publishing(publisher, subscriber):
+    """Test basic event publishing works"""
+
+    test_batch = create_test_events(5)
+    publisher.publish(test_batch)
+
+    result = subscriber.receive_one(timeout=1000)
+    assert result is not None, "No message received"
+
+    seq, received = result
+    assert seq == 0, "Sequence number mismatch"
+    assert received.ts == pytest.approx(test_batch.ts,
+                                        abs=0.1), ("Timestamp mismatch")
+    assert len(received.events) == len(
+        test_batch.events), ("Number of events mismatch")
+
+    for i, event in enumerate(received.events):
+        assert event.id == i, "Event id mismatch"
+        assert event.value == f"test-{i}", "Event value mismatch"
+
+
+def test_multiple_events(publisher, subscriber):
+    """Test publishing and receiving multiple event batches"""
+    for _ in range(10):
+        batch = create_test_events(2)
+        publisher.publish(batch)
+
+    received = []
+    for _ in range(10):
+        data = subscriber.receive_one(timeout=100)
+        if data:
+            received.append(data)
+
+    assert len(received) == 10, "Number of messages mismatch"
+    seqs = [seq for seq, _ in received]
+    assert seqs == list(range(10)), "Sequence numbers mismatch"
+
+
+def test_replay_mechanism(publisher, subscriber):
+    """Test the replay mechanism works correctly"""
+    for _ in range(19):
+        batch = create_test_events(1)
+        publisher.publish(batch)
+
+    time.sleep(0.5)  # Need publisher to process above requests
+    subscriber.request_replay(10)
+
+    batch = create_test_events(1)
+    publisher.publish(batch)  # 20th message
+
+    replayed = subscriber.receive_replay()
+
+    assert len(replayed) > 0, "No replayed messages received"
+    seqs = [seq for seq, _ in replayed]
+    assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
+    assert seqs == list(range(min(seqs),
+                              max(seqs) +
+                              1)), ("Replayed messages not consecutive")
+
+
+def test_buffer_limit(publisher, subscriber, publisher_config):
+    """Test buffer limit behavior"""
+    buffer_size = publisher_config.buffer_steps
+
+    # Publish more events than the buffer can hold
+    for i in range(buffer_size + 10):
+        batch = create_test_events(1)
+        publisher.publish(batch)
+
+    time.sleep(0.5)  # Need publisher to process above requests
+    subscriber.request_replay(0)
+
+    batch = create_test_events(1)
+    publisher.publish(batch)
+
+    replayed = subscriber.receive_replay()
+
+    assert len(replayed) <= buffer_size, "Can't replay more than buffer size"
+
+    oldest_seq = min(seq for seq, _ in replayed)
+    assert oldest_seq >= 10, "The oldest sequence should be at least 10"
+
+
+def test_topic_filtering(publisher_config):
+    """
+    Test that a subscriber only receives messages matching its topic filter
+    """
+    publisher_config.replay_endpoint = None
+
+    cfg = publisher_config.model_copy()
+    cfg.topic = "foo"
+    pub = EventPublisherFactory.create(cfg)
+
+    from .conftest import MockSubscriber
+    sub_foo = MockSubscriber(cfg.endpoint, None, "foo")
+    sub_bar = MockSubscriber(cfg.endpoint, None, "bar")
+
+    try:
+        time.sleep(0.1)
+
+        for _ in range(3):
+            pub.publish(create_test_events(1))
+
+        foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
+        assert all(msg is not None for msg in foo_received), (
+            "Subscriber with matching topic should receive messages")
+
+        bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
+        assert all(msg is None for msg in bar_received), (
+            "Subscriber with non-matching topic should receive no messages")
+    finally:
+        pub.shutdown()
+        sub_foo.close()
+        sub_bar.close()
+
+
+def test_high_volume(publisher, subscriber):
+    """Test publishing and receiving a high volume of events"""
+    num_batches = 10_000
+    events_per_batch = 100
+
+    # Publish events in a separate thread to not block
+    def publish_events():
+        for i in range(num_batches):
+            batch = create_test_events(events_per_batch)
+            publisher.publish(batch)
+            # Small delay to avoid overwhelming
+            if i % 100 == 0:
+                time.sleep(0.01)
+
+    received: list[tuple[int, SampleBatch]] = []
+
+    publisher_thread = threading.Thread(target=publish_events)
+    publisher_thread.start()
+
+    start_time = time.time()
+    while len(received) < num_batches:
+        if time.time() - start_time > 10:  # Timeout after 10 seconds
+            break
+
+        result = subscriber.receive_one(timeout=100)
+        if result:
+            received.append(result)
+
+    publisher_thread.join()
+
+    assert len(received) >= num_batches * 0.9, (
+        "We should have received most messages")
+
+    seqs = [seq for seq, _ in received]
+    assert sorted(seqs) == seqs, "Sequence numbers should be in order"
+
+
+def test_null_publisher():
+    """Test that NullEventPublisher can be used without errors"""
+    publisher = NullEventPublisher()
+
+    # This should not raise any errors
+    batch = create_test_events(5)
+    publisher.publish(batch)
+    publisher.shutdown()
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index ae4bd95d2..af0fef89d 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -6,6 +6,7 @@ from typing import Optional
 import pytest
 import torch
 
+from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import sha256
@@ -48,9 +49,10 @@ def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
         num_blocks=num_blocks,
         tensors={},
         kv_cache_groups=[
-            KVCacheGroupSpec(['layer'],
-                             FullAttentionSpec(block_size, 1, 1, torch.float32,
-                                               False))
+            KVCacheGroupSpec(
+                ["layer"],
+                FullAttentionSpec(block_size, 1, 1, torch.float32, False),
+            )
         ],
     )
 
@@ -783,6 +785,60 @@ def test_prefix_cache_stats_disabled():
     assert manager.prefix_cache_stats is None
 
 
+@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
+def test_kv_cache_events(blocks_to_cache: int):
+    block_size = 16
+    num_blocks = blocks_to_cache + 1
+
+    # Allocate Blocks
+    # Should see a single block stored event with a blocks_to_cache number of
+    # block hashes
+    # take_events should reset the kv_event_queue
+    manager = KVCacheManager(
+        make_kv_cache_config(block_size, num_blocks),
+        max_model_len=8192,
+        enable_caching=True,
+        enable_kv_cache_events=True,
+    )
+
+    num_tokens = block_size * blocks_to_cache
+    req0 = make_request("0", list(range(num_tokens)))
+    _ = manager.allocate_slots(req0, num_tokens)
+    events = manager.take_events()
+
+    block = events[-1]
+    assert (len(block.block_hashes) == blocks_to_cache == len(
+        manager.block_pool.cached_block_hash_to_block))
+    assert len(block.token_ids) == block.block_size * len(block.block_hashes)
+    assert len(manager.block_pool.kv_event_queue) == 0
+
+    stored_block_hash = block.block_hashes
+
+    # Remove blocks and send another request
+    # Should see blocks_to_cache number of removed block events and a new block
+    # stored event
+    manager.free(req0)
+    req1 = make_request("1", list(range(num_tokens)))
+    _ = manager.allocate_slots(req1, num_tokens)
+    events = manager.take_events()
+
+    for blocks in events[:-1]:
+        assert blocks.block_hashes[0] in stored_block_hash
+    assert len(events) == blocks_to_cache + 1
+    assert (isinstance(events[-2], BlockRemoved))
+    assert (len(events[-1].block_hashes) == blocks_to_cache == len(
+        manager.block_pool.cached_block_hash_to_block))
+
+    # All Blocks Cleared
+    # Should see a single all blocks cleared event
+    manager.free(req1)
+    manager.reset_prefix_cache()
+    events = manager.take_events()
+
+    assert isinstance(events[-1], AllBlocksCleared)
+    assert len(manager.block_pool.cached_block_hash_to_block) == 0
+
+
 def test_eagle_enabled_removes_last_block():
     """Verify Eagle does NOT remove blocks when request 
     length is divisible by block size."""
diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py
index f8addd920..d04679c12 100644
--- a/tests/v1/engine/conftest.py
+++ b/tests/v1/engine/conftest.py
@@ -13,6 +13,8 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
 from vllm.engine.arg_utils import EngineArgs
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 
+from ...distributed.conftest import publisher_config, random_port  # noqa: F401
+
 from tests.v1.engine.utils import FULL_STRINGS  # isort: skip
 
 EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]]
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 5514a3284..3e1aa5688 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -11,6 +11,7 @@ import pytest
 from transformers import AutoTokenizer
 
 from vllm import SamplingParams
+from vllm.distributed.kv_events import BlockStored, KVEventBatch
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
@@ -20,6 +21,7 @@ from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
                                         SyncMPClient)
 from vllm.v1.executor.abstract import Executor
 
+from ...distributed.conftest import MockSubscriber
 from ...utils import create_new_process_for_each_test
 
 if not current_platform.is_cuda():
@@ -199,54 +201,142 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
             log_stats=True,
         )
 
-        MAX_TOKENS = 20
-        params = SamplingParams(max_tokens=MAX_TOKENS)
-        """Normal Request Cycle."""
+        try:
+            MAX_TOKENS = 20
+            params = SamplingParams(max_tokens=MAX_TOKENS)
+            """Normal Request Cycle."""
 
-        requests = [make_request(params) for _ in range(10)]
-        request_ids = [req.request_id for req in requests]
+            requests = [make_request(params) for _ in range(10)]
+            request_ids = [req.request_id for req in requests]
 
-        # Add requests to the engine.
-        for request in requests:
-            await client.add_request_async(request)
-            await asyncio.sleep(0.01)
+            # Add requests to the engine.
+            for request in requests:
+                await client.add_request_async(request)
+                await asyncio.sleep(0.01)
 
-        outputs: dict[str, list] = {req_id: [] for req_id in request_ids}
-        await loop_until_done_async(client, outputs)
+            outputs: dict[str, list] = {req_id: [] for req_id in request_ids}
+            await loop_until_done_async(client, outputs)
 
-        for req_id in request_ids:
-            assert len(outputs[req_id]) == MAX_TOKENS, (
-                f"{outputs[req_id]=}, {MAX_TOKENS=}")
-        """Abort Request Cycle."""
+            for req_id in request_ids:
+                assert len(outputs[req_id]) == MAX_TOKENS, (
+                    f"{outputs[req_id]=}, {MAX_TOKENS=}")
+            """Abort Request Cycle."""
+
+            # Add requests to the engine.
+            for idx, request in enumerate(requests):
+                await client.add_request_async(request)
+                await asyncio.sleep(0.01)
+                if idx % 2 == 0:
+                    await client.abort_requests_async([request.request_id])
+
+            outputs = {req_id: [] for req_id in request_ids}
+            await loop_until_done_async(client, outputs)
+
+            for idx, req_id in enumerate(request_ids):
+                if idx % 2 == 0:
+                    assert len(outputs[req_id]) < MAX_TOKENS, (
+                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+                else:
+                    assert len(outputs[req_id]) == MAX_TOKENS, (
+                        f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
+            """Utility method invocation"""
 
-        # Add requests to the engine.
-        for idx, request in enumerate(requests):
-            await client.add_request_async(request)
-            await asyncio.sleep(0.01)
-            if idx % 2 == 0:
-                await client.abort_requests_async([request.request_id])
+            core_client: AsyncMPClient = client
 
-        outputs = {req_id: [] for req_id in request_ids}
-        await loop_until_done_async(client, outputs)
+            result = await core_client.call_utility_async("echo", "testarg")
+            assert result == "testarg"
 
-        for idx, req_id in enumerate(request_ids):
-            if idx % 2 == 0:
-                assert len(outputs[req_id]) < MAX_TOKENS, (
-                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
-            else:
-                assert len(outputs[req_id]) == MAX_TOKENS, (
-                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
-        """Utility method invocation"""
+            with pytest.raises(Exception) as e_info:
+                await core_client.call_utility_async("echo", None, "help!")
+
+            assert str(e_info.value) == "Call to echo method failed: help!"
+        finally:
+            client.shutdown()
+
+
+@pytest.mark.parametrize(
+    "multiprocessing_mode,publisher_config",
+    [(True, "tcp"), (False, "inproc")],
+    indirect=["publisher_config"],
+)
+def test_kv_cache_events(
+    monkeypatch: pytest.MonkeyPatch,
+    multiprocessing_mode: bool,
+    publisher_config,
+):
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        block_size = 16
+        num_blocks = 2
 
-        core_client: AsyncMPClient = client
+        engine_args = EngineArgs(model=MODEL_NAME,
+                                 enforce_eager=True,
+                                 enable_prefix_caching=True,
+                                 block_size=block_size)
+        engine_args.kv_events_config = publisher_config
 
-        result = await core_client.call_utility_async("echo", "testarg")
-        assert result == "testarg"
+        vllm_config = engine_args.create_engine_config(
+            UsageContext.UNKNOWN_CONTEXT)
 
-        with pytest.raises(Exception) as e_info:
-            await core_client.call_utility_async("echo", None, "help!")
+        executor_class = Executor.get_class(vllm_config)
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
+        )
+        endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+        time.sleep(0.1)
+        subscriber = MockSubscriber(endpoint,
+                                    topic=publisher_config.topic,
+                                    decode_type=KVEventBatch)
+
+        try:
+            custom_tokens = list(range(num_blocks * block_size))
+            request = EngineCoreRequest(
+                request_id=str(uuid.uuid4()),
+                prompt_token_ids=custom_tokens,
+                mm_inputs=None,
+                mm_hashes=None,
+                mm_placeholders=None,
+                sampling_params=SamplingParams(
+                    max_tokens=1),  # Short completion for speed
+                eos_token_id=None,
+                arrival_time=time.time(),
+                lora_request=None,
+            )
+            client.add_request(request)
 
-        assert str(e_info.value) == "Call to echo method failed: help!"
+            outputs: dict[str, list] = {request.request_id: []}
+            loop_until_done(client, outputs)
+
+            result = subscriber.receive_one(timeout=1000)
+            assert result is not None, "No message received"
+
+            seq, received = result
+
+            assert seq == 0, "Sequence number mismatch"
+            assert len(received.events) == 1, (
+                "We should have exactly one BlockStored event")
+            event = received.events[0]
+            assert isinstance(
+                event, BlockStored), ("We should have a BlockStored event")
+            assert len(event.block_hashes) == num_blocks, (
+                "We should have a BlockStored event with 2 block_hashes")
+            assert event.block_size == block_size, (
+                "Block size should be the same as the block size")
+            assert event.parent_block_hash is None, (
+                "Parent block hash should be None")
+            assert event.lora_id is None, "Lora id should be None"
+            assert len(event.token_ids) == num_blocks * block_size, (
+                "Token ids should be the same as the custom tokens")
+            assert event.token_ids == custom_tokens, (
+                "Token ids should be the same as the custom tokens")
+        finally:
+            client.shutdown()
+            subscriber.close()  # release the SUB socket on the shared ctx
 
 
 @pytest.mark.timeout(10)
diff --git a/vllm/config.py b/vllm/config.py
index f9c5e25a4..5da1ab258 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1958,6 +1958,8 @@ class SchedulerConfig:
     some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
     it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
 
+    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
+    # or "mod.custom_class".
     scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
     """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
     default scheduler. Can be a class directly or the path to a class of form
@@ -3417,6 +3419,51 @@ class KVTransferConfig(BaseModel):
         return self.kv_connector_extra_config.get(key, default)
 
 
+class KVEventsConfig(BaseModel):
+    """Configuration for KV event publishing."""
+
+    enable_kv_cache_events: bool = False
+    """If True, enable KV cache events for tracking block storage and removal.
+    Events can be published externally by zmq using the event publisher config.
+    """
+
+    publisher: str = "null"
+    """The publisher to use for publishing kv events. Can be "null", "zmq".
+    """
+
+    endpoint: str = "tcp://*:5557"
+    """The zmq endpoint to use for publishing kv events.
+    """
+
+    replay_endpoint: Optional[str] = None
+    """The zmq endpoint to use for replaying kv events.
+    """
+
+    buffer_steps: int = 10_000
+    """The number of steps to cache for replay endpoint. Will only save
+    events from the last N steps for the replay endpoint.
+    """
+
+    hwm: int = 100_000
+    """The zmq high water mark for the event publisher. After queueing N events,
+    events will start dropping if the consumer is not keeping up.
+    """
+
+    max_queue_size: int = 100_000
+    """The maximum number of events to queue while waiting for publishing.
+    """
+
+    topic: str = ""
+    """The topic to use for the event publisher. Consumers can subscribe to
+    this topic to receive events.
+    """
+
+    @classmethod
+    def from_cli(cls, cli_value: str) -> "KVEventsConfig":
+        """Build the config from the JSON string passed on the CLI."""
+        return cls.model_validate_json(cli_value)
+
+
 class CompilationLevel:
     # constants for the levels of the compilation process
     NO_COMPILATION = 0
@@ -3779,6 +3826,7 @@ class VllmConfig:
                                                   init=True)  # type: ignore
     kv_transfer_config: KVTransferConfig = field(default=None,
                                                  init=True)  # type: ignore
+    kv_events_config: Optional[KVEventsConfig] = None
     # some opaque config, only used to provide additional information
     # for the hash computation, mainly used for testing, debugging or out of
     # tree config registration.
@@ -4038,6 +4086,18 @@ class VllmConfig:
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False
 
+        if (self.kv_events_config
+                and self.kv_events_config.enable_kv_cache_events
+                and not self.cache_config.enable_prefix_caching):
+            logger.warning(
+                "KV cache events are on, but prefix caching is not enabled. "
+                "Use --enable-prefix-caching to enable.")
+        if (self.kv_events_config and self.kv_events_config.publisher != "null"
+                and not self.kv_events_config.enable_kv_cache_events):
+            logger.warning("KV cache events are disabled, "
+                           "but the scheduler is configured to publish them. "
+                           "Modify KVEventsConfig.enable_kv_cache_events "
+                           "to True to enable.")
         current_platform.check_and_update_config(self)
 
         if not self.instance_id:
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
new file mode 100644
index 000000000..960913858
--- /dev/null
+++ b/vllm/distributed/kv_events.py
@@ -0,0 +1,295 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import queue
+import threading
+import time
+from abc import ABC, abstractmethod
+from collections import deque
+from itertools import count
+from queue import Queue
+from typing import Any, Callable, Optional, Union
+
+import msgspec
+import zmq
+
+from vllm.config import KVEventsConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class EventBatch(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False,  # type: ignore[call-arg]
+):
+    ts: float
+    events: list[Any]
+
+
+class KVCacheEvent(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False,  # type: ignore[call-arg]
+        tag=True):
+    """Base class for all KV cache-related events"""
+
+
+class BlockStored(KVCacheEvent):
+    block_hashes: list[int]
+    parent_block_hash: Optional[int]
+    token_ids: list[int]
+    block_size: int
+    lora_id: Optional[int]
+
+
+class BlockRemoved(KVCacheEvent):
+    block_hashes: list[int]
+
+
+class AllBlocksCleared(KVCacheEvent):
+    pass
+
+
+class KVEventBatch(EventBatch):
+    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+
+
+class EventPublisher(ABC):
+    """Lightweight publisher for EventBatch batches."""
+
+    @abstractmethod
+    def publish(self, events: EventBatch) -> None:
+        """Emit events in order.
+
+        Implementations should guarantee at-least-once delivery and
+        monotonic ordering (e.g., via sequence numbers).
+        """
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """Shutdown the publisher."""
+
+
+class NullEventPublisher(EventPublisher):
+    """No-op implementation (default when disabled)."""
+
+    def publish(self, events) -> None:
+        return
+
+    def shutdown(self) -> None:
+        return
+
+
+class ZmqEventPublisher(EventPublisher):
+    """Reliable PUB/ROUTER publisher with an in-memory replay buffer.
+
+    Spawns a separate thread to handle publishing from a queue.
+
+    Parameters
+    ----------
+    endpoint:
+        PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to
+        connect.
+    replay_endpoint:
+        Optional ROUTER address for replay requests. When given, subscribers can
+        request missed batches by sending the starting sequence number as an
+        8-byte big-endian integer.
+    buffer_steps:
+        Number of past batches to keep for replay.
+    hwm:
+        ZeroMQ high-water-mark for PUB socket.
+    max_queue_size:
+        Maximum number of events to buffer in memory.
+    topic:
+        Topic to publish events to.
+    """
+    SHUTDOWN_TIMEOUT: float = 1.0
+    END_SEQ = (-1).to_bytes(8, "big", signed=True)
+
+    def __init__(
+        self,
+        endpoint: str = "tcp://*:5557",
+        replay_endpoint: Optional[str] = None,
+        buffer_steps: int = 10_000,
+        hwm: int = 100_000,
+        max_queue_size: int = 100_000,
+        topic: str = "",
+    ) -> None:
+        # Storage
+        self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
+        self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
+
+        # ZMQ sockets
+        self._ctx = zmq.Context.instance()
+        self._pub: Optional[zmq.Socket] = None
+        self._replay: Optional[zmq.Socket] = None
+        self._endpoint = endpoint
+        self._replay_endpoint = replay_endpoint
+        self._hwm = hwm
+
+        # Payload
+        self._seq_gen = count()
+        self._topic_bytes = topic.encode('utf-8')
+
+        # Thread
+        self._running = True
+        logger.info("Starting ZMQ publisher thread")
+
+        self._thread = threading.Thread(target=self._publisher_thread,
+                                        daemon=True,
+                                        name="zmq-publisher")
+        self._thread.start()
+
+    def publish(self, events: EventBatch) -> None:
+        if not self._running:
+            raise RuntimeError("Publisher is closed")
+        self._event_queue.put(events)
+
+    def shutdown(self) -> None:
+        """Stop the publisher thread and clean up resources."""
+        self._running = False
+        try:
+            self._event_queue.put_nowait(None)
+        except queue.Full:
+            pass  # no room for the sentinel; cleared `_running` ends the loop
+
+        start = time.time()
+        pending_items = True
+        while pending_items and (time.time() - start < self.SHUTDOWN_TIMEOUT):
+            pending_items = not self._event_queue.empty()
+            if pending_items:
+                time.sleep(0.1)
+
+        if pending_items:
+            logger.warning(
+                "Warning: Queue still has %s items after %s seconds timeout",
+                self._event_queue.qsize(),
+                self.SHUTDOWN_TIMEOUT,
+            )
+
+        if self._thread.is_alive():
+            self._thread.join(timeout=self.SHUTDOWN_TIMEOUT)
+
+        # Close our sockets only; the shared context may serve other sockets.
+        if self._pub is not None:
+            self._pub.close(linger=0)
+        if self._replay is not None:
+            self._replay.close(linger=0)
+
+    def _socket_setup(self) -> None:
+        """Create the PUB socket and, if configured, the ROUTER replay socket.
+
+        https://pyzmq.readthedocs.io/en/v19.0.0/morethanbindings.html#thread-safety
+        """
+        if self._pub is None:
+            self._pub = self._ctx.socket(zmq.PUB)
+            self._pub.set_hwm(self._hwm)
+            # Heuristic: bind wildcard (* or ::), ipc:// and inproc:// endpoints,
+            # else connect (bind stable, connect volatile convention).
+            wildcard = "*" in self._endpoint or "::" in self._endpoint
+            if wildcard or self._endpoint.startswith(("ipc://", "inproc://")):
+                self._pub.bind(self._endpoint)
+            else:
+                self._pub.connect(self._endpoint)
+
+        # Set up replay socket once (guarded like PUB above): use ROUTER
+        # 1) handles multiple REQ clients (identities)
+        # 2) lets us send back one request -> many replies (streamed events)
+        # 3) works in our non-blocking poll loop alongside PUB
+        if self._replay_endpoint is not None and self._replay is None:
+            self._replay = self._ctx.socket(zmq.ROUTER)
+            self._replay.bind(self._replay_endpoint)
+
+    def _publisher_thread(self) -> None:
+        """Background loop: serve replay requests and publish queued batches.
+
+        Exits when `_running` is cleared and the queue is empty, or on `None`.
+        """
+        self._pack = msgspec.msgpack.Encoder()
+        self._socket_setup()
+
+        assert self._pub is not None  # narrows type for mypy
+
+        while self._running or self._event_queue.qsize() > 0:
+            # --- replay (non-critical) ---------------------------------
+            if self._replay is not None and self._replay.poll(0):
+                try:
+                    self._service_replay()
+                except Exception as e:
+                    logger.exception("Error in replay: %s", e)
+
+            # --- main queue (critical) ---------------------------------
+            try:
+                event = self._event_queue.get(timeout=0.1)
+            except queue.Empty:
+                continue
+
+            try:
+                if event is None:
+                    break  # Sentinel received, exit thread
+                seq = next(self._seq_gen)
+                payload = self._pack.encode(event)
+                self._pub.send_multipart(
+                    (self._topic_bytes, seq.to_bytes(8, "big"), payload))
+                self._buffer.append((seq, payload))  # kept for replay
+            except Exception as e:
+                # Publishing failed;  back-off a bit to avoid a tight error loop
+                logger.exception("Error in publisher thread: %s", e)
+                time.sleep(0.1)
+            finally:
+                self._event_queue.task_done()  # balances every get()
+
+    def _service_replay(self) -> None:
+        """Answer one pending replay request with the buffered batches."""
+        assert self._replay is not None  # narrows type for mypy
+        replay = self._replay
+
+        frames = replay.recv_multipart()
+        if len(frames) != 3:
+            logger.warning("Invalid replay request: %s", frames)
+            return
+        client_id, start_seq = frames[0], int.from_bytes(frames[2], "big")
+
+        # Each reply is sent as [identity, empty_delim, seq_bytes, payload];
+        # (identity, empty_delim) are stripped off by the router, so the
+        # receiving payload is (seq_bytes, payload).
+        for seq, payload in self._buffer:
+            if seq < start_seq:
+                continue
+            replay.send_multipart(
+                (client_id, b"", seq.to_bytes(8, "big"), payload))
+        # Send end of sequence marker; receiving payload is (END_SEQ, b"").
+        replay.send_multipart((client_id, b"", self.END_SEQ, b""))
+
+
+class EventPublisherFactory:
+    """Registry mapping publisher names to `EventPublisher` constructors."""
+    _registry: dict[str, Callable[..., EventPublisher]] = {
+        "null": NullEventPublisher,
+        "zmq": ZmqEventPublisher,
+    }
+
+    @classmethod
+    def register_publisher(cls, name: str,
+                           ctor: Callable[..., EventPublisher]) -> None:
+        """Register `ctor` as `name`; raise KeyError if `name` is taken."""
+        if name in cls._registry:
+            raise KeyError(f"publisher '{name}' already registered")
+        cls._registry[name] = ctor
+
+    @classmethod
+    def create(cls, config: Optional[KVEventsConfig]) -> EventPublisher:
+        """Create the publisher named in `config` (null one if unset)."""
+        if not config:
+            return NullEventPublisher()
+        kwargs = config.model_dump()
+        kind = kwargs.pop("publisher", "null")
+        kwargs.pop("enable_kv_cache_events")
+        try:
+            factory = cls._registry[kind]
+        except KeyError as exc:
+            raise ValueError(f"Unknown event publisher '{kind}'") from exc
+        return factory(**kwargs)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4f074fcd1..c7a580cf1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -19,14 +19,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          ConfigFormat, ConfigType, DecodingConfig, Device,
                          DeviceConfig, DistributedExecutorBackend,
                          GuidedDecodingBackend, GuidedDecodingBackendV1,
-                         HfOverrides, KVTransferConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, ModelDType, ModelImpl,
-                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
-                         PoolerConfig, PrefixCachingHashAlgo,
-                         PromptAdapterConfig, SchedulerConfig, SchedulerPolicy,
-                         SpeculativeConfig, TaskOption, TokenizerMode,
-                         TokenizerPoolConfig, VllmConfig, get_attr_docs,
-                         get_field)
+                         HfOverrides, KVEventsConfig, KVTransferConfig,
+                         LoadConfig, LoadFormat, LoRAConfig, ModelConfig,
+                         ModelDType, ModelImpl, MultiModalConfig,
+                         ObservabilityConfig, ParallelConfig, PoolerConfig,
+                         PrefixCachingHashAlgo, PromptAdapterConfig,
+                         SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
+                         TaskOption, TokenizerMode, TokenizerPoolConfig,
+                         VllmConfig, get_attr_docs, get_field)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -353,6 +353,7 @@ class EngineArgs:
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
     kv_transfer_config: Optional[KVTransferConfig] = None
+    kv_events_config: Optional[KVEventsConfig] = None
 
     generation_config: str = ModelConfig.generation_config
     enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
@@ -769,6 +770,10 @@ class EngineArgs:
                             default=None,
                             help='The configurations for distributed KV cache '
                             'transfer. Should be a JSON string.')
+        parser.add_argument('--kv-events-config',
+                            type=KVEventsConfig.from_cli,
+                            default=None,
+                            help='The configurations for event publishing.')
 
         parser.add_argument(
             '--worker-cls',
@@ -1125,6 +1130,7 @@ class EngineArgs:
             prompt_adapter_config=prompt_adapter_config,
             compilation_config=self.compilation_config,
             kv_transfer_config=self.kv_transfer_config,
+            kv_events_config=self.kv_events_config,
             additional_config=self.additional_config,
         )
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index 74f3f7852..f2ed183b6 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -3,6 +3,8 @@ from collections import defaultdict
 from collections.abc import Iterable
 from typing import Callable, Optional
 
+from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved,
+                                        BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                          KVCacheBlock,
@@ -26,7 +28,12 @@ class BlockPool:
         enable_caching: Whether to enable prefix caching.
     """
 
-    def __init__(self, num_gpu_blocks: int, enable_caching: bool):
+    def __init__(
+        self,
+        num_gpu_blocks: int,
+        enable_caching: bool,
+        enable_kv_cache_events: bool = False,
+    ):
         assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
         self.num_gpu_blocks = num_gpu_blocks
         self.enable_caching = enable_caching
@@ -56,6 +63,9 @@ class BlockPool:
         # avoid freeing it.
         self.null_block = self.free_block_queue.popleft()
 
+        self.enable_kv_cache_events = enable_kv_cache_events
+        self.kv_event_queue: list[KVCacheEvent] = []
+
     def get_cached_block(self,
                          block_hash: BlockHashType) -> Optional[KVCacheBlock]:
         """Get a cached block by the block hash, or None if cache miss.
@@ -116,6 +126,9 @@ class BlockPool:
             assert prev_block.block_hash is not None
             prev_block_hash_value = prev_block.block_hash.hash_value
 
+        parent_block_hash = prev_block_hash_value
+        new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events
+                                           else None)
         for i, blk in enumerate(new_full_blocks):
             assert blk.block_hash is None
 
@@ -153,8 +166,23 @@ class BlockPool:
             # Update and added the full block to the cache.
             blk.block_hash = block_hash
             self.cached_block_hash_to_block[block_hash][blk.block_id] = blk
+            if new_hashes is not None:
+                new_hashes.append(block_hash.hash_value)
             prev_block_hash_value = block_hash.hash_value
 
+        if self.enable_kv_cache_events:
+            self.kv_event_queue.append(
+                BlockStored(
+                    block_hashes=new_hashes,
+                    parent_block_hash=parent_block_hash,
+                    token_ids=request.
+                    all_token_ids[num_cached_blocks *
+                                  block_size:num_full_blocks * block_size],
+                    block_size=block_size,
+                    lora_id=request.lora_request.id
+                    if request.lora_request else None,
+                ))
+
     def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
         """Get new blocks from the free block pool.
 
@@ -206,6 +234,9 @@ class BlockPool:
             if len(self.cached_block_hash_to_block[block_hash]) == 0:
                 del self.cached_block_hash_to_block[block_hash]
 
+            if self.enable_kv_cache_events:
+                self.kv_event_queue.append(
+                    BlockRemoved(block_hashes=[block_hash.hash_value]))
             return True
         return False
 
@@ -262,6 +293,10 @@ class BlockPool:
             block.reset_hash()
 
         logger.info("Successfully reset prefix cache")
+
+        if self.enable_kv_cache_events:
+            self.kv_event_queue.append(AllBlocksCleared())
+
         return True
 
     def get_num_free_blocks(self) -> int:
@@ -279,3 +314,15 @@ class BlockPool:
             The KV cache usage (between 0.0 and 1.0).
         """
         return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks)
+
+    def take_events(self) -> list[KVCacheEvent]:
+        """Return every queued KV cache event and reset the queue.
+
+        Returns:
+            The drained events; always empty when KV cache events are off.
+        """
+        if not self.enable_kv_cache_events:
+            return []
+        # Swap in a fresh list so the caller owns the returned one.
+        drained, self.kv_event_queue = self.kv_event_queue, []
+        return drained
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 0830d8433..39554bed0 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -4,6 +4,7 @@ from collections import defaultdict
 from collections.abc import Iterable
 from typing import Optional
 
+from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
 from vllm.utils import cdiv, sha256
 from vllm.v1.core.block_pool import BlockPool
@@ -27,6 +28,7 @@ class KVCacheManager:
         caching_hash_algo: str = "builtin",
         use_eagle: bool = False,
         log_stats: bool = False,
+        enable_kv_cache_events: bool = False,
     ) -> None:
         assert len(kv_cache_config.kv_cache_groups) == 1, (
             "KVCacheManager does not support hybrid models with more than 1 "
@@ -44,7 +46,9 @@ class KVCacheManager:
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
-        self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching)
+        self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching,
+                                    enable_kv_cache_events)
+
         self.specialized_manager = get_specialized_manager(
             kv_cache_spec=kv_cache_spec,
             block_pool=self.block_pool,
@@ -383,3 +387,11 @@ class KVCacheManager:
         is finished, not when it is preempted.
         """
         self.req_to_block_hashes.pop(request.request_id, None)
+
+    def take_events(self) -> list[KVCacheEvent]:
+        """Drain the KV cache events accumulated by the block pool.
+
+        Returns:
+            The list produced by the block pool's `take_events()`.
+        """
+        return self.block_pool.take_events()
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 1de236d42..0b328f510 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -132,3 +132,8 @@ class SchedulerInterface(ABC):
         The SchedulerStats object is created for every scheduling step.
         """
         raise NotImplementedError
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """Shut down the scheduler (e.g. stop its KV event publisher)."""
+        raise NotImplementedError
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 7ebbb4954..ae7280a14 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -8,6 +8,7 @@ from collections.abc import Iterable
 from typing import Optional, Union
 
 from vllm.config import VllmConfig
+from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
 from vllm.distributed.kv_transfer.kv_connector.factory import (
     KVConnectorFactory)
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole
@@ -48,6 +49,7 @@ class Scheduler(SchedulerInterface):
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
         self.kv_cache_config = kv_cache_config
+        self.kv_events_config = vllm_config.kv_events_config
         self.log_stats = log_stats
         self.structured_output_manager = structured_output_manager
 
@@ -62,6 +64,9 @@ class Scheduler(SchedulerInterface):
         self.max_num_scheduled_tokens = \
             self.scheduler_config.max_num_batched_tokens
         self.max_model_len = self.scheduler_config.max_model_len
+        self.enable_kv_cache_events = (
+            self.kv_events_config is not None
+            and self.kv_events_config.enable_kv_cache_events)
 
         # Create KVConnector for the Scheduler. Note that each Worker
         # will have a corresponding KVConnector with Role=WORKER.
@@ -71,6 +76,9 @@ class Scheduler(SchedulerInterface):
             self.connector = KVConnectorFactory.create_connector_v1(
                 config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
 
+        self.kv_event_publisher = EventPublisherFactory.create(
+            self.kv_events_config)
+
         num_gpu_blocks = self.cache_config.num_gpu_blocks
         assert num_gpu_blocks is not None and num_gpu_blocks > 0
 
@@ -132,7 +140,9 @@ class Scheduler(SchedulerInterface):
             enable_caching=self.cache_config.enable_prefix_caching,
             caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
             use_eagle=self.use_eagle,
-            log_stats=self.log_stats)
+            log_stats=self.log_stats,
+            enable_kv_cache_events=self.enable_kv_cache_events,
+        )
 
     def schedule(self) -> SchedulerOutput:
         # NOTE(woosuk) on the scheduling algorithm:
@@ -493,6 +503,11 @@ class Scheduler(SchedulerInterface):
             meta = self.connector.build_connector_meta(scheduler_output)
             scheduler_output.kv_connector_metadata = meta
 
+        events = self.kv_cache_manager.take_events()
+        if events:
+            batch = KVEventBatch(ts=time.time(), events=events)
+            self.kv_event_publisher.publish(batch)
+
         # Advance the number of computed tokens for the request AFTER
         # the request is scheduled.
         # 1. The scheduler_output of the current step has to include the
@@ -843,3 +858,7 @@ class Scheduler(SchedulerInterface):
             num_draft_tokens=num_draft_tokens,
             num_accepted_tokens=num_accepted_tokens)
         return spec_decoding_stats
+
+    def shutdown(self) -> None:
+        if self.kv_event_publisher:  # built by EventPublisherFactory.create()
+            self.kv_event_publisher.shutdown()  # stop the KV event publisher
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 5912318f1..e772615b7 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -259,6 +259,8 @@ class EngineCore:
         self.structured_output_manager.clear_backend()
         if self.model_executor:
             self.model_executor.shutdown()
+        if self.scheduler:
+            self.scheduler.shutdown()
 
     def profile(self, is_start: bool = True):
         self.model_executor.profile(is_start)
-- 
GitLab


From 2990cee95bb74394fb56c6fb6cc994fa80acbc86 Mon Sep 17 00:00:00 2001
From: Chauncey 
Date: Wed, 30 Apr 2025 22:48:21 +0800
Subject: [PATCH 060/461] [Feature] The Qwen3 reasoning parser supports  guided
 decoding (#17466)

Signed-off-by: chaunceyjiang 
---
 vllm/reasoning/qwen3_reasoning_parser.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 78a73011f..f588f4016 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -47,6 +47,18 @@ class Qwen3ReasoningParser(ReasoningParser):
                 "Qwen3 reasoning parser could not locate think start/end "
                 "tokens in the tokenizer!")
 
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        return self.think_end_token_id in input_ids  # end-of-think token seen
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """Return the ids after the first think-end token ([] if none)."""
+        try:
+            end = input_ids.index(self.think_end_token_id)
+        except ValueError:
+            return []  # no think-end token yet, so no content either
+        # A trailing think-end token yields an empty slice, as before.
+        return input_ids[end + 1:]
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
@@ -88,7 +100,6 @@ class Qwen3ReasoningParser(ReasoningParser):
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         elif self.think_start_token_id in delta_token_ids:
-            logger.info(delta_text)
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
-- 
GitLab


From 39317cf42be27fa4a15558910b706f3d58327f94 Mon Sep 17 00:00:00 2001
From: Russell Bryant 
Date: Wed, 30 Apr 2025 11:06:09 -0400
Subject: [PATCH 061/461] [Docs] Add command for running mypy tests from CI
 (#17475)

Signed-off-by: Russell Bryant 
---
 docs/source/contributing/overview.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index 3dceec1e5..7c4016cae 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -40,6 +40,10 @@ pre-commit install --hook-type pre-commit --hook-type commit-msg
 # You can manually run pre-commit with
 pre-commit run --all-files
 
+# To manually run something from CI that does not run
+# locally by default, you can run:
+pre-commit run mypy-3.9 --hook-stage manual --all-files
+
 # Unit tests
 pytest tests/
 ```
-- 
GitLab


From da4e7687b5d21d405bf229a74e055cc78cbcc06b Mon Sep 17 00:00:00 2001
From: Aaron Pham 
Date: Wed, 30 Apr 2025 11:06:58 -0400
Subject: [PATCH 062/461] [Fix] Support passing args to logger (#17425)

Signed-off-by: Aaron Pham 
---
 vllm/config.py                                | 12 +++---
 vllm/logger.py                                | 17 +++++----
 vllm/lora/punica_wrapper/punica_selector.py   |  3 +-
 vllm/model_executor/custom_op.py              |  6 +--
 .../guided_decoding/xgrammar_decoding.py      |  6 +--
 .../layers/quantization/awq_marlin.py         |  5 ++-
 .../model_loader/weight_utils.py              | 20 +++++-----
 vllm/model_executor/models/chameleon.py       |  8 ++--
 vllm/model_executor/models/olmoe.py           |  9 ++---
 vllm/model_executor/models/qwen2_moe.py       |  9 ++---
 vllm/model_executor/models/qwen3_moe.py       |  9 ++---
 vllm/multimodal/profiling.py                  | 38 ++++++++-----------
 vllm/multimodal/registry.py                   | 12 +++---
 13 files changed, 75 insertions(+), 79 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 5da1ab258..e96d872d6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -278,7 +278,7 @@ class ModelConfig:
     max_model_len: int = None  # type: ignore
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
-    
+
     When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
     format. Examples:\n
     - 1k -> 1000\n
@@ -518,11 +518,11 @@ class ModelConfig:
                     self.hf_text_config.sliding_window)
 
                 logger.warning_once(
-                    f"{self.hf_text_config.model_type} has interleaved "
-                    "attention, which is currently not supported by the "
-                    f"{backend} backend. Disabling sliding window and capping "
-                    "the max length to the sliding window size "
-                    f"({sliding_window_len_min}).")
+                    "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).",  # noqa: E501
+                    self.hf_text_config.model_type,
+                    backend,
+                    sliding_window_len_min,
+                )
                 self.disable_sliding_window = True
             else:
                 # for a model with interleaved attention,
diff --git a/vllm/logger.py b/vllm/logger.py
index 2b0b9da2d..c162e2e04 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -5,6 +5,7 @@ import json
 import logging
 import os
 import sys
+from collections.abc import Hashable
 from functools import lru_cache, partial
 from logging import Logger
 from logging.config import dictConfig
@@ -52,15 +53,15 @@ DEFAULT_LOGGING_CONFIG = {
 
 
 @lru_cache
-def _print_info_once(logger: Logger, msg: str) -> None:
+def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
     # Set the stacklevel to 2 to print the original caller's line info
-    logger.info(msg, stacklevel=2)
+    logger.info(msg, *args, stacklevel=2)
 
 
 @lru_cache
-def _print_warning_once(logger: Logger, msg: str) -> None:
+def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
     # Set the stacklevel to 2 to print the original caller's line info
-    logger.warning(msg, stacklevel=2)
+    logger.warning(msg, *args, stacklevel=2)
 
 
 class _VllmLogger(Logger):
@@ -72,19 +73,19 @@ class _VllmLogger(Logger):
         `intel_extension_for_pytorch.utils._logger`.
     """
 
-    def info_once(self, msg: str) -> None:
+    def info_once(self, msg: str, *args: Hashable) -> None:
         """
         As :meth:`info`, but subsequent calls with the same message
         are silently dropped.
         """
-        _print_info_once(self, msg)
+        _print_info_once(self, msg, *args)
 
-    def warning_once(self, msg: str) -> None:
+    def warning_once(self, msg: str, *args: Hashable) -> None:
         """
         As :meth:`warning`, but subsequent calls with the same message
         are silently dropped.
         """
-        _print_warning_once(self, msg)
+        _print_warning_once(self, msg, *args)
 
 
 def _configure_vllm_root_logger() -> None:
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
index ad5d4b788..922d6c060 100644
--- a/vllm/lora/punica_wrapper/punica_selector.py
+++ b/vllm/lora/punica_wrapper/punica_selector.py
@@ -15,6 +15,5 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
     punica_wrapper = punica_wrapper_cls(*args, **kwargs)
     assert punica_wrapper is not None, \
         "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
-    logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] +
-                     ".")
+    logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
     return punica_wrapper
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index dfd052f62..b0d00ee48 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -107,9 +107,9 @@ class CustomOp(nn.Module):
         custom_ops = compilation_config.custom_ops
         if not hasattr(cls, "name"):
             logger.warning_once(
-                f"Custom op {cls.__name__} was not registered, "
-                f"which means it won't appear in the op registry. "
-                f"It will be enabled/disabled based on the global settings.")
+                "Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.",  # noqa: E501
+                cls.__name__,
+            )
             return CustomOp.default_on()
 
         enabled = f"+{cls.name}" in custom_ops
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index c63acfdde..eb7b2b74f 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -191,9 +191,9 @@ class GrammarConfig:
 
             if model_with_warn is not None and any_whitespace:
                 logger.info_once(
-                    f"{model_with_warn} model detected, consider setting "
-                    "`disable_any_whitespace` to prevent runaway generation "
-                    "of whitespaces.")
+                    "%s model detected, consider setting `disable_any_whitespace` to prevent runaway generation of whitespaces.",  # noqa: E501
+                    model_with_warn,
+                )
             # Validate the schema and raise ValueError here if it is invalid.
             # This is to avoid exceptions in model execution, which will crash
             # the engine worker process.
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 193e90b85..07d928b59 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -130,8 +130,9 @@ class AWQMarlinConfig(QuantizationConfig):
             # Check if the layer is supported by AWQMarlin.
             if not check_marlin_supports_layer(layer, self.group_size):
                 logger.warning_once(
-                    f"Layer '{prefix}' is not supported by AWQMarlin. "
-                    "Falling back to unoptimized AWQ kernels.")
+                    "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.",  # noqa: E501
+                    prefix,
+                )
                 return AWQConfig.from_config(
                     self.full_config).get_quant_method(layer, prefix)
             return AWQMarlinLinearMethod(self)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 1bb592f49..37a8491cf 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -464,7 +464,7 @@ def fastsafetensors_weights_iterator(
     hf_weights_files: List[str],
     use_tqdm_on_load: bool,
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-    """Iterate over the weights in the model safetensor files 
+    """Iterate over the weights in the model safetensor files
     using fastsafetensor library."""
     if torch.distributed.is_initialized():
         pg = torch.distributed.group.WORLD
@@ -716,10 +716,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
         remapped_name = name.replace(".kv_scale", ".attn.k_scale")
         if remapped_name not in params_dict:
             logger.warning_once(
-                f"Found kv_scale in the checkpoint (e.g. {name}), "
-                "but not found the expected name in the model "
-                f"(e.g. {remapped_name}). kv_scale is "
-                "not loaded.")
+                "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.",  #  noqa: E501
+                name,
+                remapped_name,
+            )
             return None
         return remapped_name
 
@@ -738,10 +738,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
                 remapped_name = name.replace(scale_name, f".attn{scale_name}")
             if remapped_name not in params_dict:
                 logger.warning_once(
-                    f"Found {scale_name} in the checkpoint (e.g. {name}), "
-                    "but not found the expected name in the model "
-                    f"(e.g. {remapped_name}). {scale_name} is "
-                    "not loaded.")
+                    "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.",  # noqa: E501
+                    scale_name,
+                    name,
+                    remapped_name,
+                    scale_name,
+                )
                 return None
             return remapped_name
 
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index e2c275300..ef8b033f3 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -1111,10 +1111,10 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
-                                "Found kv scale in the checkpoint (e.g. "
-                                f"{name}), but not found the expected name in "
-                                f"the model (e.g. {remapped_kv_scale_name}). "
-                                "kv-scale is not loaded.")
+                                "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
+                                name,
+                                remapped_kv_scale_name,
+                            )
                             continue
                         else:
                             name = remapped_kv_scale_name
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 9bed29d01..e6925e125 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -385,11 +385,10 @@ class OlmoeModel(nn.Module):
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
-                                "Found kv scale in the checkpoint "
-                                f"(e.g. {name}), but not found the expected "
-                                f"name in the model "
-                                f"(e.g. {remapped_kv_scale_name}). "
-                                "kv-scale is not loaded.")
+                                "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
+                                name,
+                                remapped_kv_scale_name,
+                            )
                             continue
                         else:
                             name = remapped_kv_scale_name
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 62696678b..47d90919e 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -462,11 +462,10 @@ class Qwen2MoeModel(nn.Module):
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
-                                "Found kv scale in the checkpoint "
-                                f"(e.g. {name}), but not found the expected "
-                                f"name in the model "
-                                f"(e.g. {remapped_kv_scale_name}). "
-                                "kv-scale is not loaded.")
+                                "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.",  #  noqa: E501
+                                name,
+                                remapped_kv_scale_name,
+                            )
                             continue
                         else:
                             name = remapped_kv_scale_name
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 70f9956e3..97acbaa2a 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -459,11 +459,10 @@ class Qwen3MoeModel(nn.Module):
                             ".kv_scale", ".attn.kv_scale")
                         if remapped_kv_scale_name not in params_dict:
                             logger.warning_once(
-                                "Found kv scale in the checkpoint "
-                                f"(e.g. {name}), but not found the expected "
-                                f"name in the model "
-                                f"(e.g. {remapped_kv_scale_name}). "
-                                "kv-scale is not loaded.")
+                                "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
+                                name,
+                                remapped_kv_scale_name,
+                            )
                             continue
                         else:
                             name = remapped_kv_scale_name
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index a173487c4..b351acc8c 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -215,17 +215,14 @@ class MultiModalProfiler(Generic[_I]):
         elif total_len > seq_len and not envs.VLLM_USE_V1:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
             logger.warning_once(
-                "The encoder sequence length used for profiling ("
-                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
-                " is too short "
-                "to hold the multi-modal embeddings in the worst case "
-                f"({total_len} tokens in total, out of which "
-                f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
-                "multi-modal embeddings). This may cause certain "
-                "multi-modal inputs to fail during inference, even when "
-                "the input text is short. To avoid this, you should "
-                "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.")
+                "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
+                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
+                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
+                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
+                seq_len,
+                total_len,
+                str(self._get_mm_num_tokens(mm_inputs)),
+            )
 
         return DummyEncoderData(encoder_prompt_token_ids)
 
@@ -243,17 +240,14 @@ class MultiModalProfiler(Generic[_I]):
         if total_len > seq_len and not envs.VLLM_USE_V1:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
             logger.warning_once(
-                "The sequence length used for profiling ("
-                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
-                "is too short "
-                "to hold the multi-modal embeddings in the worst case "
-                f"({total_len} tokens in total, out of which "
-                f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
-                "multi-modal embeddings). This may cause certain "
-                "multi-modal inputs to fail during inference, even when "
-                "the input text is short. To avoid this, you should "
-                "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.")
+                "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
+                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
+                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
+                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
+                seq_len,
+                total_len,
+                str(self._get_mm_num_tokens(mm_inputs)),
+            )
 
         if total_len < seq_len:
             prompt_token_ids.extend([0] * (seq_len - total_len))
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 68598b941..9d2b4e486 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -100,7 +100,7 @@ class MultiModalRegistry:
         model_config: "ModelConfig",
     ) -> Mapping[str, int]:
         """
-        Get the maximum number of tokens per data item from each modality based 
+        Get the maximum number of tokens per data item from each modality based
         on underlying model configuration.
         """
         if not model_config.is_multimodal_model:
@@ -126,11 +126,11 @@ class MultiModalRegistry:
     ) -> Mapping[str, int]:
         """
         Get the maximum number of tokens per data item from each modality based
-        on underlying model configuration, excluding modalities that user 
+        on underlying model configuration, excluding modalities that user
         explicitly disabled via `limit_mm_per_prompt`.
 
         Note:
-            This is currently directly used only in V1 for profiling the memory 
+            This is currently directly used only in V1 for profiling the memory
             usage of a model.
         """
         mm_limits = self.get_mm_limits_per_prompt(model_config)
@@ -316,7 +316,9 @@ class MultiModalRegistry:
         token_ids = dummy_data.prompt_token_ids
         if len(token_ids) < seq_len:
             logger.warning_once(
-                f"Expected at least {seq_len} dummy encoder tokens for "
-                f"profiling, but found {len(token_ids)} tokens instead.")
+                "Expected at least %d dummy encoder tokens for profiling, but found %d tokens instead.",  # noqa: E501
+                seq_len,
+                len(token_ids),
+            )
 
         return dummy_data
-- 
GitLab


From 739e03b3449a7f3b0a81ebc30b9555305d914e2d Mon Sep 17 00:00:00 2001
From: Pete Savage 
Date: Wed, 30 Apr 2025 16:08:37 +0100
Subject: [PATCH 063/461] [Bugfix] Fixed mistral tokenizer path when pointing
 to file (#17457)

Signed-off-by: Pete Savage 
---
 vllm/transformers_utils/tokenizers/mistral.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 6d4655781..3db7a0a5c 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -227,6 +227,7 @@ class MistralTokenizer(TokenizerBase):
         else:
             assert Path(
                 path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
+            tokenizer_file = str(Path(path_or_repo_id))
 
         from mistral_common.tokens.tokenizers.mistral import (
             MistralTokenizer as PublicMistralTokenizer)
-- 
GitLab


From 947f2f5375571a59fd509fccf5894f932240203e Mon Sep 17 00:00:00 2001
From: Russell Bryant 
Date: Wed, 30 Apr 2025 12:10:54 -0400
Subject: [PATCH 064/461] [V1] Allow turning off pickle fallback in
 vllm.v1.serial_utils (#17427)

Signed-off-by: Russell Bryant 
Co-authored-by: Cyrus Leung 
---
 tests/v1/test_serial_utils.py | 98 +++++++++++++++++++++++++++++++++++
 vllm/v1/serial_utils.py       | 21 +++++---
 2 files changed, 113 insertions(+), 6 deletions(-)

diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index b55018ae8..d1271b210 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -5,6 +5,7 @@ from typing import Optional
 
 import msgspec
 import numpy as np
+import pytest
 import torch
 
 from vllm.multimodal.inputs import (MultiModalBatchedField,
@@ -196,3 +197,100 @@ def assert_equal(obj1: MyType, obj2: MyType):
     assert torch.equal(obj1.large_non_contig_tensor,
                        obj2.large_non_contig_tensor)
     assert torch.equal(obj1.empty_tensor, obj2.empty_tensor)
+
+
+@pytest.mark.parametrize("allow_pickle", [True, False])
+def test_dict_serialization(allow_pickle: bool):
+    """Test round-tripping a plain dict with and without pickle fallback."""
+    encoder = MsgpackEncoder(allow_pickle=allow_pickle)
+    decoder = MsgpackDecoder(allow_pickle=allow_pickle)
+
+    # Create a sample dict of built-in types (str and int values)
+    obj = {"key": "value", "number": 42}
+
+    # Encode the object
+    encoded = encoder.encode(obj)
+
+    # Decode the object (no target type is passed to the decoder)
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded object matches the original
+    assert obj == decoded, "Decoded object does not match the original object."
+
+
+@pytest.mark.parametrize("allow_pickle", [True, False])
+def test_tensor_serialization(allow_pickle: bool):
+    """Test round-tripping a torch.Tensor with and without pickle fallback."""
+    encoder = MsgpackEncoder(allow_pickle=allow_pickle)
+    decoder = MsgpackDecoder(torch.Tensor, allow_pickle=allow_pickle)
+
+    # Create a sample 10x10 tensor of random values
+    tensor = torch.rand(10, 10)
+
+    # Encode the tensor
+    encoded = encoder.encode(tensor)
+
+    # Decode the tensor (decoder was told to expect a torch.Tensor)
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded tensor matches the original (torch.allclose)
+    assert torch.allclose(
+        tensor, decoded), "Decoded tensor does not match the original tensor."
+
+
+@pytest.mark.parametrize("allow_pickle", [True, False])
+def test_numpy_array_serialization(allow_pickle: bool):
+    """Test round-tripping a numpy array with and without pickle fallback."""
+    encoder = MsgpackEncoder(allow_pickle=allow_pickle)
+    decoder = MsgpackDecoder(np.ndarray, allow_pickle=allow_pickle)
+
+    # Create a sample 10x10 array of random values
+    array = np.random.rand(10, 10)
+
+    # Encode the numpy array
+    encoded = encoder.encode(array)
+
+    # Decode the numpy array (decoder was told to expect an np.ndarray)
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded array matches the original (np.allclose)
+    assert np.allclose(
+        array,
+        decoded), "Decoded numpy array does not match the original array."
+
+
+class CustomClass:  # arbitrary user-defined type used by the pickle tests
+
+    def __init__(self, value):
+        self.value = value
+
+    def __eq__(self, other):  # value equality so round-trips can be compared
+        return isinstance(other, CustomClass) and self.value == other.value
+
+
+def test_custom_class_serialization_allowed_with_pickle():
+    """Test that serializing a custom class succeeds when allow_pickle=True."""
+    encoder = MsgpackEncoder(allow_pickle=True)
+    decoder = MsgpackDecoder(CustomClass, allow_pickle=True)
+
+    obj = CustomClass("test_value")
+
+    # Encode the custom class (expected to go through the pickle fallback)
+    encoded = encoder.encode(obj)
+
+    # Decode the custom class
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded object matches the original (uses CustomClass.__eq__)
+    assert obj == decoded, "Decoded object does not match the original object."
+
+
+def test_custom_class_serialization_disallowed_without_pickle():
+    """Test that serializing a custom class fails when allow_pickle=False."""
+    encoder = MsgpackEncoder(allow_pickle=False)
+
+    obj = CustomClass("test_value")
+
+    with pytest.raises(TypeError):
+        # enc_hook raises TypeError for unsupported types when pickle is off
+        encoder.encode(obj)
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index a3ad8cb92..e00ecde66 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -47,7 +47,9 @@ class MsgpackEncoder:
     via dedicated messages. Note that this is a per-tensor limit.
     """
 
-    def __init__(self, size_threshold: Optional[int] = None):
+    def __init__(self,
+                 size_threshold: Optional[int] = None,
+                 allow_pickle: bool = True):
         if size_threshold is None:
             size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
@@ -56,6 +58,7 @@ class MsgpackEncoder:
         # pass custom data to the hook otherwise.
         self.aux_buffers: Optional[list[bytestr]] = None
         self.size_threshold = size_threshold
+        self.allow_pickle = allow_pickle
 
     def encode(self, obj: Any) -> Sequence[bytestr]:
         try:
@@ -105,6 +108,9 @@ class MsgpackEncoder:
                     for itemlist in mm._items_by_modality.values()
                     for item in itemlist]
 
+        if not self.allow_pickle:
+            raise TypeError(f"Object of type {type(obj)} is not serializable")
+
         if isinstance(obj, FunctionType):
             # `pickle` is generally faster than cloudpickle, but can have
             # problems serializing methods.
@@ -179,12 +185,13 @@ class MsgpackDecoder:
     not thread-safe when encoding tensors / numpy arrays.
     """
 
-    def __init__(self, t: Optional[Any] = None):
+    def __init__(self, t: Optional[Any] = None, allow_pickle: bool = True):
         args = () if t is None else (t, )
         self.decoder = msgpack.Decoder(*args,
                                        ext_hook=self.ext_hook,
                                        dec_hook=self.dec_hook)
         self.aux_buffers: Sequence[bytestr] = ()
+        self.allow_pickle = allow_pickle
 
     def decode(self, bufs: Union[bytestr, Sequence[bytestr]]) -> Any:
         if isinstance(bufs, (bytes, bytearray, memoryview, zmq.Frame)):
@@ -265,10 +272,12 @@ class MsgpackDecoder:
     def ext_hook(self, code: int, data: memoryview) -> Any:
         if code == CUSTOM_TYPE_RAW_VIEW:
             return data
-        if code == CUSTOM_TYPE_PICKLE:
-            return pickle.loads(data)
-        if code == CUSTOM_TYPE_CLOUDPICKLE:
-            return cloudpickle.loads(data)
+
+        if self.allow_pickle:
+            if code == CUSTOM_TYPE_PICKLE:
+                return pickle.loads(data)
+            if code == CUSTOM_TYPE_CLOUDPICKLE:
+                return cloudpickle.loads(data)
 
         raise NotImplementedError(
             f"Extension type code {code} is not supported")
-- 
GitLab


From 0b7e701dd40f4dd4b1f53afc31f087f8bb9e1ae8 Mon Sep 17 00:00:00 2001
From: Michael Goin 
Date: Wed, 30 Apr 2025 10:34:02 -0600
Subject: [PATCH 065/461] [Docs] Update optimization.md doc (#17482)

Signed-off-by: mgoin 
---
 docs/source/performance/optimization.md | 187 ++++++++++++++++++++----
 1 file changed, 155 insertions(+), 32 deletions(-)

diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md
index ccbe8a367..4160f0784 100644
--- a/docs/source/performance/optimization.md
+++ b/docs/source/performance/optimization.md
@@ -2,65 +2,188 @@
 
 # Optimization and Tuning
 
+This guide covers optimization strategies and performance tuning for vLLM V1.
+
 ## Preemption
 
 Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
-The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
-available again. When this occurs, the following warning is printed:
+In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
+available again. When this occurs, you may see the following warning:
 
 ```text
-WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
+WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.RECOMPUTE mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1
 ```
 
 While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency.
-If you frequently encounter preemptions from the vLLM engine, consider the following actions:
+If you frequently encounter preemptions, consider the following actions:
+
+- Increase `gpu_memory_utilization`. vLLM pre-allocates GPU cache using this percentage of memory. By increasing utilization, you can provide more KV cache space.
+- Decrease `max_num_seqs` or `max_num_batched_tokens`. This reduces the number of concurrent requests in a batch, thereby requiring less KV cache space.
+- Increase `tensor_parallel_size`. This shards model weights across GPUs, allowing each GPU to have more memory available for KV cache. However, increasing this value may cause excessive synchronization overhead.
+- Increase `pipeline_parallel_size`. This distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, indirectly leaving more memory available for KV cache. However, increasing this value may cause latency penalties.
 
-- Increase `gpu_memory_utilization`. The vLLM pre-allocates GPU cache by using gpu_memory_utilization% of memory. By increasing this utilization, you can provide more KV cache space.
-- Decrease `max_num_seqs` or `max_num_batched_tokens`. This can reduce the number of concurrent requests in a batch, thereby requiring less KV cache space.
-- Increase `tensor_parallel_size`. This approach shards model weights, so each GPU has more memory available for KV cache.
-- Increase `pipeline_parallel_size`. This approach distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, which indirectly leaves more memory available for KV cache.
+You can monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting `disable_log_stats=False`.
 
-You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
+In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
 
 (chunked-prefill)=
 
 ## Chunked Prefill
 
-vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
+Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
+
+In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
+
+With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
+
+This policy has two benefits:
+
+- It improves inter-token latency (ITL) and decode performance because decode requests are prioritized.
+- It helps achieve better GPU utilization by co-locating compute-bound (prefill) and memory-bound (decode) requests in the same batch.
 
-You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
+### Performance Tuning with Chunked Prefill
+
+You can tune the performance by adjusting `max_num_batched_tokens`:
+
+- Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes.
+- Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch.
+- For optimal throughput, we recommend setting `max_num_batched_tokens > 8192`, especially for smaller models on large GPUs.
+- If `max_num_batched_tokens` is the same as `max_model_len`, the behavior is almost equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
 
 ```python
 from vllm import LLM
 
-llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
-# Set max_num_batched_tokens to tune performance.
-# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
-# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048)
+# Set max_num_batched_tokens to tune performance
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_num_batched_tokens=16384)
 ```
 
-By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch.
-This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization.
+See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>).
 
-Once chunked prefill is enabled, the policy is changed to prioritize decode requests.
-It batches all pending decode requests to the batch before scheduling any prefill.
-When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills.
-If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it.
+## Parallelism Strategies
 
-This policy has two benefits:
+vLLM supports multiple parallelism strategies that can be combined to optimize performance across different hardware configurations.
 
-- It improves ITL and generation decode because decode requests are prioritized.
-- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
+### Tensor Parallelism (TP)
 
-You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048.
-Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes.
-Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch.
+Tensor parallelism shards model parameters across multiple GPUs within each model layer. This is the most common strategy for large model inference within a single node.
 
-- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
-- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
+**When to use:**
 
-We recommend you set `max_num_batched_tokens > 2048` for throughput.
+- When the model is too large to fit on a single GPU
+- When you need to reduce memory pressure per GPU to allow more KV cache space for higher throughput
 
-See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>).
+```python
+from vllm import LLM
+
+# Split model across 4 GPUs
+llm = LLM(model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4)
+```
+
+For models that are too large to fit on a single GPU (like 70B parameter models), tensor parallelism is essential.
+
+### Pipeline Parallelism (PP)
+
+Pipeline parallelism distributes model layers across multiple GPUs. Each GPU processes different parts of the model in sequence.
+
+**When to use:**
+
+- When you've already maxed out efficient tensor parallelism but need to distribute the model further, or across nodes
+- For very deep and narrow models where layer distribution is more efficient than tensor sharding
+
+Pipeline parallelism can be combined with tensor parallelism for very large models:
+
+```python
+from vllm import LLM
+
+# Combine pipeline and tensor parallelism
+llm = LLM(
+    model="meta-llama/Llama-3.3-70B-Instruct",
+    tensor_parallel_size=4,
+    pipeline_parallel_size=2
+)
+```
+
+### Expert Parallelism (EP)
+
+Expert parallelism is a specialized form of parallelism for Mixture of Experts (MoE) models, where different expert networks are distributed across GPUs.
+
+**When to use:**
 
-Please try out this feature and let us know your feedback via GitHub issues!
+- Specifically for MoE models (like DeepSeekV3, Qwen3MoE, Llama-4)
+- When you want to balance the expert computation load across GPUs
+
+Expert parallelism is enabled by setting `enable_expert_parallel=True`, which will use expert parallelism instead of tensor parallelism for MoE layers.
+It will use the same degree of parallelism as what you have set for tensor parallelism.
+
+### Data Parallelism (DP)
+
+Data parallelism replicates the entire model across multiple GPU sets and processes different batches of requests in parallel.
+
+**When to use:**
+
+- When you have enough GPUs to replicate the entire model
+- When you need to scale throughput rather than model size
+- In multi-user environments where isolation between request batches is beneficial
+
+Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
+Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
+
+## Reducing Memory Usage
+
+If you encounter out-of-memory issues, consider these strategies:
+
+### Context Length and Batch Size
+
+You can reduce memory usage by limiting the context length and batch size:
+
+```python
+from vllm import LLM
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    max_model_len=2048,  # Limit context window
+    max_num_seqs=4       # Limit batch size
+)
+```
+
+### Adjust CUDA Graph Compilation
+
+CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level:
+
+```python
+from vllm import LLM
+from vllm.config import CompilationConfig, CompilationLevel
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        cudagraph_capture_sizes=[1, 2, 4, 8]  # Capture fewer batch sizes
+    )
+)
+```
+
+Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`:
+
+```python
+from vllm import LLM
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    enforce_eager=True  # Disable CUDA graph compilation
+)
+```
+
+### Multimodal Models
+
+For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request:
+
+```python
+from vllm import LLM
+
+# Accept up to 2 images per prompt
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"image": 2}
+)
+```
-- 
GitLab


From d586ddc691eadb7f5d8eb390da8035e5cb6c30d9 Mon Sep 17 00:00:00 2001
From: zh Wang 
Date: Thu, 1 May 2025 00:51:05 +0800
Subject: [PATCH 066/461] [BugFix] Fix authorization of
 openai_transcription_client.py (#17321)

Signed-off-by: zh Wang 
---
 examples/online_serving/openai_transcription_client.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index 5fcb7c526..66e622672 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -46,11 +46,15 @@ async def stream_openai_response():
         "model": "openai/whisper-large-v3",
     }
     url = openai_api_base + "/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {openai_api_key}"}
     print("transcription result:", end=' ')
     async with httpx.AsyncClient() as client:
         with open(str(winning_call), "rb") as f:
-            async with client.stream('POST', url, files={'file': f},
-                                     data=data) as response:
+            async with client.stream('POST',
+                                     url,
+                                     files={'file': f},
+                                     data=data,
+                                     headers=headers) as response:
                 async for line in response.aiter_lines():
                     # Each line is a JSON object prefixed with 'data: '
                     if line:
-- 
GitLab


From 584f5fb4c6d96365a3bfa8594115bc02744f2096 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Wed, 30 Apr 2025 12:59:06 -0400
Subject: [PATCH 067/461] [Bugfix][ROCm] Restrict ray version due to a breaking
 release (#17480)

Signed-off-by: Gregory Shtrasberg 
---
 requirements/rocm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 4df92aab3..0df56b258 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -9,7 +9,7 @@ awscli
 boto3
 botocore
 datasets
-ray >= 2.10.0
+ray>=2.10.0,<2.45.0
 peft
 pytest-asyncio
 tensorizer>=2.9.0
-- 
GitLab


From 2ac74d098ef7b8748db0cdaa255eeceb5cdd5366 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Thu, 1 May 2025 01:02:41 +0800
Subject: [PATCH 068/461] [doc] add install tips (#17373)

Signed-off-by: reidliu41 
Co-authored-by: reidliu41 
---
 docs/source/features/quantization/fp8.md           | 14 +++++++-------
 docs/source/features/quantization/int4.md          |  8 +++++++-
 docs/source/features/quantization/int8.md          |  8 +++++++-
 .../features/quantization/quantized_kvcache.md     |  2 +-
 docs/source/features/quantization/quark.md         |  7 +++++++
 5 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index a62e0124b..b90bb49ef 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -44,6 +44,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves three main steps:
@@ -86,7 +92,7 @@ recipe = QuantizationModifier(
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
 
-# Save the model.
+# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR)
 tokenizer.save_pretrained(SAVE_DIR)
@@ -94,12 +100,6 @@ tokenizer.save_pretrained(SAVE_DIR)
 
 ### 3. Evaluating Accuracy
 
-Install `vllm` and `lm-evaluation-harness`:
-
-```console
-pip install vllm lm-eval==0.4.4
-```
-
 Load and run the model in `vllm`:
 
 ```python
diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md
index f8939e5bf..be48788a4 100644
--- a/docs/source/features/quantization/int4.md
+++ b/docs/source/features/quantization/int4.md
@@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -87,7 +93,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index b381f34bc..d6ddca18e 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -91,7 +97,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md
index 9f36c2949..86e6354ec 100644
--- a/docs/source/features/quantization/quantized_kvcache.md
+++ b/docs/source/features/quantization/quantized_kvcache.md
@@ -126,7 +126,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save quantized model
+# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/quark.md b/docs/source/features/quantization/quark.md
index 935ee37a8..955890dbc 100644
--- a/docs/source/features/quantization/quark.md
+++ b/docs/source/features/quantization/quark.md
@@ -19,6 +19,12 @@ pip install amd-quark
 You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
 for more installation details.
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 After installing Quark, we will use an example to illustrate how to use Quark.  
@@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
 export_config = ExporterConfig(json_export_config=JsonExporterConfig())
 export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
 
+# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
-- 
GitLab


From 42d9a2c4c70232204c799794be43164d19233fa1 Mon Sep 17 00:00:00 2001
From: David Xia 
Date: Wed, 30 Apr 2025 13:03:20 -0400
Subject: [PATCH 069/461] doc: fix bug report Github template formatting
 (#17486)

Signed-off-by: David Xia 
---
 .github/ISSUE_TEMPLATE/400-bug-report.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
index b96ab4074..637d2dd11 100644
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -21,12 +21,12 @@ body:
       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       
- The output of `python collect_env.py` + The output of python collect_env.py ```text Your output of `python collect_env.py` here ``` - +
validations: required: true -- GitLab From 81ecf425f0a645e6d6f1c339ba79ef37e95a9569 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 1 May 2025 02:25:53 +0800 Subject: [PATCH 070/461] [v1][Spec Decode] Make sliding window compatible with eagle prefix caching (#17398) Signed-off-by: Chen Zhang --- tests/v1/core/test_prefix_caching.py | 64 +++++++++++++++++++++-- tests/v1/core/test_specialized_manager.py | 8 ++- vllm/v1/core/kv_cache_manager.py | 8 +-- vllm/v1/core/specialized_manager.py | 39 ++++++++++---- 4 files changed, 96 insertions(+), 23 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index af0fef89d..4c05e0b87 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -15,7 +15,7 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, hash_block_tokens) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec) + KVCacheGroupSpec, SlidingWindowSpec) def make_request(request_id, @@ -863,11 +863,11 @@ def test_eagle_enabled_removes_last_block(): req_eagle = make_request("eagle_divisible", token_ids) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) - # Should retain 2 blocks: + # Should retain 1 block: # 1. Original 3 blocks → pop last hash → 2 matched blocks - # 2. last_block_hash is not None → Eagle pop is not SKIPPED + # 2. 
drop last matched block → 1 remaining block assert len(computed_blocks) == 1 - assert num_tokens == 1 * block_size # 32 tokens + assert num_tokens == 1 * block_size # 16 tokens def test_eagle_with_partial_blocks(): @@ -894,3 +894,59 @@ def test_eagle_with_partial_blocks(): # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks) == 1 assert num_tokens == 1 * block_size + + +def test_eagle_with_sliding_window(): + """Test Eagle behavior with sliding window.""" + block_size = 16 + sliding_window_spec = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + sliding_window=block_size, + use_mla=False, + ) + manager = KVCacheManager( + KVCacheConfig( + num_blocks=10, + tensors={}, + kv_cache_groups=[KVCacheGroupSpec(['layer'], sliding_window_spec)], + ), + max_model_len=8192, + enable_caching=True, + use_eagle=True, + ) + + # 2 full blocks + 5 tokens (non-divisible length) + token_ids = [0] * (2 * block_size + 5) + req = make_request("partial_block_test", token_ids) + + # Prime the cache + computed_blocks, _ = manager.get_computed_blocks(req) + manager.allocate_slots(req, len(token_ids), computed_blocks) + # record the block hash of the first block in the request for later use + block_hash_first_block = manager.req_to_block_hashes[req.request_id][0] + assert block_hash_first_block is not None + manager.free(req) + + # New request with Eagle enabled + req_eagle = make_request("partial_eagle", token_ids) + computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) + # Original match: 2 full blocks → Eagle removes 1 → 1 remaining + assert len(computed_blocks) == 1 + assert num_tokens == 1 * block_size + + # Evict the first block in the request + assert manager.block_pool.get_cached_block( + block_hash_first_block) is not None + manager.block_pool.cached_block_hash_to_block.pop(block_hash_first_block) + + # New request + req_after_evict = make_request("partial_eagle_after_evict", 
token_ids) + computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict) + # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is + # not considered. But after dropping the last matched block due to eagle, + # there will be no matched prefix. + assert len(computed_blocks) == 0 + assert num_tokens == 0 diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index 9b4ab5fa8..595c8608f 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -19,7 +19,9 @@ def test_sliding_window_possible_cached_prefix(): ) block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) - manager = SlidingWindowManager(sliding_window_spec, block_pool) + manager = SlidingWindowManager(sliding_window_spec, + block_pool, + use_eagle=False) def run_one_case(block_is_cached, expect_length): block_hash_list = [ @@ -79,7 +81,9 @@ def test_sliding_window_remove_skipped_blocks(): block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) - manager = SlidingWindowManager(sliding_window_spec, block_pool) + manager = SlidingWindowManager(sliding_window_spec, + block_pool, + use_eagle=False) null_block_id = block_pool.null_block.block_id diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 39554bed0..cb13a5b7a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -52,6 +52,7 @@ class KVCacheManager: self.specialized_manager = get_specialized_manager( kv_cache_spec=kv_cache_spec, block_pool=self.block_pool, + use_eagle=self.use_eagle, ) # Mapping from request ID to blocks to track the blocks allocated @@ -141,13 +142,6 @@ class KVCacheManager: computed_blocks = ( self.specialized_manager.find_longest_cache_hit(block_hashes)) - if self.use_eagle and len(computed_blocks) > 0: - # Drop the last matched block if (1) eagle is enabled and - # (2) there is a cache hit. 
- # This is to recompute the last block to get the required - # hidden states for eagle drafting head. - computed_blocks.pop() - if self.log_stats: assert self.prefix_cache_stats is not None self.prefix_cache_stats.queries += len(block_hashes) diff --git a/vllm/v1/core/specialized_manager.py b/vllm/v1/core/specialized_manager.py index 7a8a98361..f04eedf42 100644 --- a/vllm/v1/core/specialized_manager.py +++ b/vllm/v1/core/specialized_manager.py @@ -18,6 +18,7 @@ class SpecializedManager(ABC): self, kv_cache_spec: KVCacheSpec, block_pool: BlockPool, + use_eagle: bool, ) -> None: """ Initializes the SpecializedManager. @@ -30,12 +31,17 @@ class SpecializedManager(ABC): self.kv_cache_spec = kv_cache_spec self.block_pool = block_pool + # Needs special handling for find_longest_cache_hit if eagle is enabled + self.use_eagle = use_eagle + @abstractmethod def find_longest_cache_hit( self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]: """ Get the longest cache hit prefix of the blocks. If no cache hit is - found, return an empty list. + found, return an empty list. If eagle is enabled, drop the last matched + block to force recomputation of the last block to get the required hidden + states for eagle drafting head. Args: block_hashes: The block hashes of the request. 
@@ -79,6 +85,8 @@ class FullAttentionManager(SpecializedManager): computed_blocks.append(cached_block) else: break + if self.use_eagle and len(computed_blocks) > 0: + computed_blocks.pop() return computed_blocks def remove_skipped_blocks(self, blocks: list[KVCacheBlock], @@ -89,14 +97,20 @@ class FullAttentionManager(SpecializedManager): class SlidingWindowManager(SpecializedManager): - def __init__(self, kv_cache_spec: SlidingWindowSpec, - block_pool: BlockPool): - super().__init__(kv_cache_spec, block_pool) + def __init__(self, kv_cache_spec: SlidingWindowSpec, block_pool: BlockPool, + use_eagle: bool): + super().__init__(kv_cache_spec, block_pool, use_eagle) self.sliding_window = kv_cache_spec.sliding_window # The number of contiguous blocks needed for prefix cache hit. # -1 since the input token itself is also included in the window self.sliding_window_contiguous_blocks = cdiv( (kv_cache_spec.sliding_window - 1), self.block_size) + if self.use_eagle: + # Need to drop the last matched block if eagle is enabled. For + # sliding window layer, we achieve this by increasing the number of + # contiguous blocks needed for prefix cache hit by one and dropping + # the last matched block. + self.sliding_window_contiguous_blocks += 1 self._null_block = block_pool.null_block def find_longest_cache_hit( @@ -109,6 +123,7 @@ class SlidingWindowManager(SpecializedManager): computed_blocks = [self._null_block] * len(block_hashes) num_contiguous_blocks = 0 + match_found = False # Search from right to left and early stop when a match is found. for i in range(len(block_hashes) - 1, -1, -1): if cached_block := self.block_pool.get_cached_block( @@ -121,12 +136,16 @@ class SlidingWindowManager(SpecializedManager): # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3] # when sliding_window_contiguous_blocks=2. 
del computed_blocks[i + num_contiguous_blocks:] - return computed_blocks + match_found = True + break else: num_contiguous_blocks = 0 - # The first `num_contiguous_blocks` is a cache hit even if - # `num_contiguous_blocks < sliding_window_contiguous_blocks`. - del computed_blocks[num_contiguous_blocks:] + if not match_found: + # The first `num_contiguous_blocks` is a cache hit even if + # `num_contiguous_blocks < sliding_window_contiguous_blocks`. + del computed_blocks[num_contiguous_blocks:] + if self.use_eagle and len(computed_blocks) > 0: + computed_blocks.pop() return computed_blocks def remove_skipped_blocks(self, blocks: list[KVCacheBlock], @@ -155,7 +174,7 @@ spec_manager_map: dict[type[KVCacheSpec], type[SpecializedManager]] = { def get_specialized_manager(kv_cache_spec: KVCacheSpec, - block_pool: BlockPool) -> SpecializedManager: + **kwargs) -> SpecializedManager: manager_class = spec_manager_map[type(kv_cache_spec)] - manager = manager_class(kv_cache_spec, block_pool) + manager = manager_class(kv_cache_spec, **kwargs) return manager -- GitLab From 200bbf92e8861e2458a6f90bca73f40cc3b1ad1f Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Wed, 30 Apr 2025 17:24:45 -0500 Subject: [PATCH 071/461] Bump Compressed Tensors version to 0.9.4 (#17478) Signed-off-by: Rahul Tuli Co-authored-by: mgoin --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 36fc791cc..fba3f3580 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -36,7 +36,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.9.3 # required for compressed-tensors +compressed-tensors == 0.9.4 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files -- GitLab From 02bd65484630fc024a14a7cb3c5cb3c8b0ff81b8 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 30 Apr 2025 20:51:36 -0600 Subject: [PATCH 072/461] [Misc] Rename Audios -> Audio in Qwen2audio Processing (#17507) Signed-off-by: Alex-Brooks --- vllm/model_executor/models/qwen2_audio.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0cb541c6c..f30bf08ab 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -150,8 +150,15 @@ class Qwen2AudioMultiModalProcessor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: + # NOTE - we rename audios -> audio in mm data because transformers has + # deprecated audios for the qwen2audio processor and will remove + # support for it in transformers 4.54. 
+ audios = mm_data.pop("audios", []) + if audios: + mm_data["audio"] = audios + # Text-only input not supported in composite processor - if not mm_data.get("audios", []): + if not mm_data.get("audio", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") -- GitLab From dbc18e78165e72291703f6aaf991267fbbc19d2d Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Wed, 30 Apr 2025 19:51:39 -0700 Subject: [PATCH 073/461] [CI][TPU] Skip Multimodal test (#17488) Signed-off-by: Siyuan Liu --- tests/v1/tpu/test_multimodal.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index eb62e0e4b..dbd2e2204 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -32,6 +32,8 @@ def base64_encoded_image() -> dict[str, str]: async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]): + pytest.skip("Skip this test until it's fixed.") + def whats_in_this_image_msg(b64): return [{ "role": -- GitLab From 08fb5587b4c4c1b031c08adf1b8608d710ab2585 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 30 Apr 2025 22:51:42 -0400 Subject: [PATCH 074/461] [Bugfix][ROCm] Fix import error on ROCm (#17495) Signed-off-by: Gregory Shtrasberg --- vllm/model_executor/layers/rotary_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b179a0f00..8cad5482d 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -32,7 +32,7 @@ from transformers import PretrainedConfig from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -if current_platform.is_cuda_alike(): +if 
current_platform.is_cuda(): from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb -- GitLab From 1144a8efe715d0274412d215a3ad1f941a8469f1 Mon Sep 17 00:00:00 2001 From: NaLan ZeYu Date: Thu, 1 May 2025 10:51:45 +0800 Subject: [PATCH 075/461] [Bugfix] Temporarily disable gptq_bitblas on ROCm (#17411) Signed-off-by: Yan Cangang --- docs/source/features/quantization/supported_hardware.md | 2 +- vllm/model_executor/layers/quantization/gptq_bitblas.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index 984e6626e..08893f0e9 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -80,7 +80,7 @@ The table below shows the compatibility of various quantization implementations * ✅︎ * ✅︎ * ✅︎ - * ✅︎ + * ❌ * ❌ * ❌ * ❌ diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index 891d8cdf3..6ee3a2f1b 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -25,6 +25,7 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, PackedColumnParameter, PackedvLLMParameter, RowvLLMParameter) +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -191,6 +192,10 @@ class GPTQBitBLASConfig(QuantizationConfig): sym = quant_config.get("sym") desc_act = quant_config.get("desc_act") + # temporarily disable on ROCm platform + if not current_platform.is_cuda(): + return False + # If we cannot find the info needed in the config, cannot convert. 
if (num_bits is None or group_size is None or sym is None or desc_act is None): -- GitLab From 17b4d85f63e0edcf55cc7abc0769c3d0a9cbe340 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 30 Apr 2025 21:36:20 -0600 Subject: [PATCH 076/461] [CI][TPU] Skip structured outputs+spec decode tests on TPU (#17510) Signed-off-by: mgoin --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d25699591..c3ea024f5 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -81,6 +81,9 @@ def test_structured_output( ): monkeypatch.setenv("VLLM_USE_V1", "1") + if current_platform.is_tpu() and speculative_config: + pytest.skip("TPU does not support speculative decoding") + # Don't use eager execution on TPUs because we want to test for no # recompilation at runtime enforce_eager = bool(not current_platform.is_tpu()) -- GitLab From aa4502e7f3d9f53f093bb9b91607617a2223156b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 30 Apr 2025 22:03:30 -0600 Subject: [PATCH 077/461] [CI][Bugfix] Fix failing V1 Test due to missing 'cache_salt' arg (#17500) Signed-off-by: mgoin --- tests/v1/engine/test_engine_core_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 3e1aa5688..ae072a47e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -306,6 +306,7 @@ def test_kv_cache_events( eos_token_id=None, arrival_time=time.time(), lora_request=None, + cache_salt=None, ) client.add_request(request) -- GitLab From afb4429b4f13e744b1630b6c5a09156e5b1ececc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 1 May 2025 14:03:08 +0800 Subject: [PATCH 078/461] [CI/Build] Reorganize models tests 
(#17459) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 86 ++++---- pyproject.toml | 1 - tests/entrypoints/openai/test_embedding.py | 12 +- .../openai/test_embedding_dimensions.py | 5 +- tests/models/embedding/utils.py | 66 ------ .../embedding/vision_language/__init__.py | 0 tests/models/encoder_decoder/__init__.py | 0 .../audio_language/__init__.py | 0 .../encoder_decoder/language/__init__.py | 0 .../vision_language/__init__.py | 0 .../vision_language/test_broadcast.py | 37 ---- .../{decoder_only => language}/__init__.py | 0 .../generation}/__init__.py | 0 .../generation}/test_bart.py | 4 - .../generation}/test_granite.py | 0 .../generation}/test_hybrid.py | 18 +- .../generation}/test_mistral.py | 0 .../generation}/test_models.py | 0 .../generation}/test_phimoe.py | 0 .../language => language/pooling}/__init__.py | 0 .../pooling}/test_cls_models.py | 0 .../pooling}/test_embedding.py | 2 +- .../pooling}/test_gritlm.py | 189 +++++++++--------- .../pooling}/test_jina.py | 3 +- .../pooling}/test_scoring.py | 0 .../pooling}/test_snowflake_arctic_embed.py | 4 +- .../pooling}/test_truncation_control.py | 0 .../generation}/__init__.py | 0 .../generation/test_common.py} | 7 + .../generation}/test_florence2.py | 0 .../generation}/test_granite_speech.py | 0 .../generation}/test_interleaved.py | 1 + .../generation}/test_intern_vit.py | 0 .../generation}/test_mllama.py | 34 +++- .../generation}/test_phi4mm.py | 0 .../generation}/test_pixtral.py | 0 .../generation}/test_qwen2_vl.py | 0 .../generation}/test_ultravox.py | 0 .../generation}/test_whisper.py | 58 +++--- .../generation}/vlm_utils/__init__.py | 0 .../generation}/vlm_utils/builders.py | 0 .../generation}/vlm_utils/case_filtering.py | 0 .../generation}/vlm_utils/core.py | 0 .../generation}/vlm_utils/custom_inputs.py | 0 .../generation}/vlm_utils/model_utils.py | 0 .../generation}/vlm_utils/runners.py | 0 .../generation}/vlm_utils/types.py | 0 .../pooling}/__init__.py | 0 
.../pooling}/test_dse_qwen2_vl.py | 2 +- .../pooling}/test_llava_next.py | 2 +- .../pooling}/test_phi3v.py | 2 +- .../language => quantization}/__init__.py | 0 .../language => quantization}/test_aqlm.py | 1 - .../test_awq.py | 5 +- .../models/{ => quantization}/test_bitblas.py | 2 +- .../language => quantization}/test_fp8.py | 7 +- .../language => quantization}/test_gguf.py | 7 +- .../{ => quantization}/test_gptq_bitblas.py | 2 +- .../test_gptq_marlin.py | 3 +- .../test_gptq_marlin_24.py | 3 +- .../test_modelopt.py | 1 - .../language => quantization}/test_nvfp4.py | 1 - tests/models/utils.py | 66 +++++- vllm/config.py | 4 +- vllm/model_executor/models/llama.py | 6 +- 65 files changed, 317 insertions(+), 324 deletions(-) delete mode 100644 tests/models/embedding/utils.py delete mode 100644 tests/models/embedding/vision_language/__init__.py delete mode 100644 tests/models/encoder_decoder/__init__.py delete mode 100644 tests/models/encoder_decoder/audio_language/__init__.py delete mode 100644 tests/models/encoder_decoder/language/__init__.py delete mode 100644 tests/models/encoder_decoder/vision_language/__init__.py delete mode 100644 tests/models/encoder_decoder/vision_language/test_broadcast.py rename tests/models/{decoder_only => language}/__init__.py (100%) rename tests/models/{decoder_only/audio_language => language/generation}/__init__.py (100%) rename tests/models/{encoder_decoder/language => language/generation}/test_bart.py (98%) rename tests/models/{decoder_only/language => language/generation}/test_granite.py (100%) rename tests/models/{decoder_only/language => language/generation}/test_hybrid.py (96%) rename tests/models/{decoder_only/language => language/generation}/test_mistral.py (100%) rename tests/models/{decoder_only/language => language/generation}/test_models.py (100%) rename tests/models/{decoder_only/language => language/generation}/test_phimoe.py (100%) rename tests/models/{decoder_only/language => language/pooling}/__init__.py (100%) rename 
tests/models/{embedding/language => language/pooling}/test_cls_models.py (100%) rename tests/models/{embedding/language => language/pooling}/test_embedding.py (98%) rename tests/models/{embedding/language => language/pooling}/test_gritlm.py (64%) rename tests/models/{embedding/language => language/pooling}/test_jina.py (98%) rename tests/models/{embedding/language => language/pooling}/test_scoring.py (100%) rename tests/models/{embedding/language => language/pooling}/test_snowflake_arctic_embed.py (97%) rename tests/models/{embedding/language => language/pooling}/test_truncation_control.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/__init__.py (100%) rename tests/models/{decoder_only/vision_language/test_models.py => multimodal/generation/test_common.py} (98%) rename tests/models/{encoder_decoder/vision_language => multimodal/generation}/test_florence2.py (100%) rename tests/models/{decoder_only/audio_language => multimodal/generation}/test_granite_speech.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_interleaved.py (99%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_intern_vit.py (100%) rename tests/models/{encoder_decoder/vision_language => multimodal/generation}/test_mllama.py (96%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_phi4mm.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_pixtral.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_qwen2_vl.py (100%) rename tests/models/{decoder_only/audio_language => multimodal/generation}/test_ultravox.py (100%) rename tests/models/{encoder_decoder/audio_language => multimodal/generation}/test_whisper.py (83%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/__init__.py (100%) rename tests/models/{decoder_only/vision_language => 
multimodal/generation}/vlm_utils/builders.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/case_filtering.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/core.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/custom_inputs.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/model_utils.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/runners.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/types.py (100%) rename tests/models/{embedding => multimodal/pooling}/__init__.py (100%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_dse_qwen2_vl.py (99%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_llava_next.py (99%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_phi3v.py (98%) rename tests/models/{embedding/language => quantization}/__init__.py (100%) rename tests/models/{decoder_only/language => quantization}/test_aqlm.py (99%) rename tests/models/{decoder_only/vision_language => quantization}/test_awq.py (97%) rename tests/models/{ => quantization}/test_bitblas.py (97%) rename tests/models/{decoder_only/language => quantization}/test_fp8.py (97%) rename tests/models/{decoder_only/language => quantization}/test_gguf.py (97%) rename tests/models/{ => quantization}/test_gptq_bitblas.py (97%) rename tests/models/{decoder_only/language => quantization}/test_gptq_marlin.py (97%) rename tests/models/{decoder_only/language => quantization}/test_gptq_marlin_24.py (97%) rename tests/models/{decoder_only/language => quantization}/test_modelopt.py (99%) rename tests/models/{decoder_only/language => quantization}/test_nvfp4.py (99%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 
8da43322c..13ed64ed0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,12 +390,15 @@ steps: commands: - pytest -v -s benchmarks/ -- label: Quantization Test # 33min +- label: Quantization Test source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization - tests/quantization - command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + - tests/models/quantization + commands: + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + - pytest -v -s models/quantization - label: LM Eval Small Models # 53min working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" @@ -441,82 +444,70 @@ steps: commands: - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py + - pytest -v -s models/test_utils.py + - pytest -v -s models/test_vision.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' -- label: Language Models Test (Standard) # 32min +- label: Language Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language + - tests/models/language commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - - pytest -v -s models/embedding/language -m core_model + - pytest -v -s models/language -m core_model -- label: Language Models Test (Extended) # 1h10min +- label: Language Models Test (Extended) optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language + - tests/models/language commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install causal-conv1d - - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/language -m 'not core_model' + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 40min +- label: Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/audio_language - - tests/models/encoder_decoder/vision_language + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model + - cd .. 
&& pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal - - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - - pytest -v -s models/embedding/vision_language -m core_model - - pytest -v -s models/encoder_decoder/audio_language -m core_model - - pytest -v -s models/encoder_decoder/language -m core_model - - pytest -v -s models/encoder_decoder/vision_language -m core_model - - pytest -v -s models/decoder_only/vision_language/test_interleaved.py - -- label: Multi-Modal Models Test (Extended) 1 # 48m + - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + +- label: Multi-Modal Models Test (Extended) 2 optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/vision_language + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/vision_language -m 'not core_model' - - pytest -v -s models/encoder_decoder/language -m 'not core_model' - - pytest -v -s 
models/encoder_decoder/vision_language -m 'not core_model' - -- label: Multi-Modal Models Test (Extended) 2 # 38m + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/vision_language + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test @@ -586,9 +577,8 @@ steps: - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' # test sequence parallel - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. 
diff --git a/pyproject.toml b/pyproject.toml index c85e85b0c..e51d4c9a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -158,7 +158,6 @@ markers = [ "skip_global_cleanup", "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", - "quant_model: run this model test under Quantized category", "split: run this test as part of a split", "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 50b20e78c..1019bfd58 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,7 +11,7 @@ import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.embedding.utils import correctness_test +from ...models.utils import run_embedding_correctness_test from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" @@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 11 vllm_outputs = [d.embedding for d in embeddings.data] - correctness_test(hf_model, input_texts, vllm_outputs) + run_embedding_correctness_test(hf_model, input_texts, vllm_outputs) # test using token IDs input_tokens = [1, 1, 1, 1, 1] @@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 33 vllm_outputs = [d.embedding for d in embeddings.data] - correctness_test(hf_model, input_texts, vllm_outputs) + run_embedding_correctness_test(hf_model, input_texts, vllm_outputs) # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], @@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI, model=model_name, 
encoding_format="float") float_data = [d.embedding for d in responses_float.data] - correctness_test(hf_model, input_texts, float_data) + run_embedding_correctness_test(hf_model, input_texts, float_data) responses_base64 = await client.embeddings.create(input=input_texts, model=model_name, @@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI, np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - correctness_test(hf_model, input_texts, base64_data) + run_embedding_correctness_test(hf_model, input_texts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) default_data = [d.embedding for d in responses_default.data] - correctness_test(hf_model, input_texts, default_data) + run_embedding_correctness_test(hf_model, input_texts, default_data) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 9f5a8c683..332fa332a 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -11,7 +11,7 @@ import pytest from vllm.entrypoints.openai.protocol import EmbeddingResponse from ...conftest import HfRunner -from ...models.embedding.utils import EmbedModelInfo, correctness_test +from ...models.utils import EmbedModelInfo, run_embedding_correctness_test from ...utils import RemoteOpenAIServer MODELS = [ @@ -95,7 +95,8 @@ async def test_matryoshka(model_info: EmbedModelInfo, assert len(embeddings.data[0].embedding) == dimensions vllm_outputs = [d.embedding for d in embeddings.data] - correctness_test(hf_model, prompts, vllm_outputs, dimensions) + run_embedding_correctness_test(hf_model, prompts, vllm_outputs, + dimensions) if model_info.is_matryoshka: valid_dimensions: list[Optional[int]] = [None] diff --git 
a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py deleted file mode 100644 index 6d4df2c26..000000000 --- a/tests/models/embedding/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from collections.abc import Sequence -from typing import NamedTuple, Optional - -import torch -import torch.nn.functional as F - - -def check_embeddings_close( - *, - embeddings_0_lst: Sequence[list[float]], - embeddings_1_lst: Sequence[list[float]], - name_0: str, - name_1: str, - tol: float = 1e-3, -) -> None: - assert len(embeddings_0_lst) == len(embeddings_1_lst) - - for prompt_idx, (embeddings_0, embeddings_1) in enumerate( - zip(embeddings_0_lst, embeddings_1_lst)): - assert len(embeddings_0) == len(embeddings_1), ( - f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}") - - sim = F.cosine_similarity(torch.tensor(embeddings_0), - torch.tensor(embeddings_1), - dim=0) - - fail_msg = (f"Test{prompt_idx}:" - f"\n{name_0}:\t{embeddings_0[:16]!r}" - f"\n{name_1}:\t{embeddings_1[:16]!r}") - - assert sim >= 1 - tol, fail_msg - - -def matryoshka_fy(tensor, dimensions): - tensor = torch.tensor(tensor) - tensor = tensor[..., :dimensions] - tensor = F.normalize(tensor, p=2, dim=1) - return tensor - - -class EmbedModelInfo(NamedTuple): - name: str - is_matryoshka: bool - matryoshka_dimensions: Optional[list[int]] = None - architecture: str = "" - enable_test: bool = True - - -def correctness_test(hf_model, - inputs, - vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None): - - hf_outputs = hf_model.encode(inputs) - if dimensions: - hf_outputs = matryoshka_fy(hf_outputs, dimensions) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) diff --git a/tests/models/embedding/vision_language/__init__.py b/tests/models/embedding/vision_language/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/models/encoder_decoder/vision_language/__init__.py b/tests/models/encoder_decoder/vision_language/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py deleted file mode 100644 index 8d986414e..000000000 --- a/tests/models/encoder_decoder/vision_language/test_broadcast.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "meta-llama/Llama-3.2-11B-Vision-Instruct", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): - from .test_mllama import models, run_test - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/decoder_only/__init__.py b/tests/models/language/__init__.py similarity 
index 100% rename from tests/models/decoder_only/__init__.py rename to tests/models/language/__init__.py diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/language/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/audio_language/__init__.py rename to tests/models/language/generation/__init__.py diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/language/generation/test_bart.py similarity index 98% rename from tests/models/encoder_decoder/language/test_bart.py rename to tests/models/language/generation/test_bart.py index e8070d28b..8ab0167dc 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM for BART models using greedy sampling. - -Run `pytest tests/models/encoder_decoder/language/test_bart.py`. -""" from typing import Optional import pytest diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/language/generation/test_granite.py similarity index 100% rename from tests/models/decoder_only/language/test_granite.py rename to tests/models/language/generation/test_granite.py diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/language/generation/test_hybrid.py similarity index 96% rename from tests/models/decoder_only/language/test_hybrid.py rename to tests/models/language/generation/test_hybrid.py index e5e0c28ae..880967b4a 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -289,23 +289,25 @@ def test_multistep_correctness( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) -def test_hybrid_distributed_produces_identical_generation( +@pytest.mark.parametrize("num_logprobs", [5]) +def test_distributed_correctness( 
vllm_runner, example_prompts, model: str, max_tokens: int, + num_logprobs: int, ) -> None: - with vllm_runner(model, tensor_parallel_size=2, + with vllm_runner(model, tensor_parallel_size=1, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, tensor_parallel_size=1, + with vllm_runner(model, tensor_parallel_size=2, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=vllm_outputs_tp_1, outputs_1_lst=vllm_outputs_tp_2, name_0="vllm_tp_1", diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/language/generation/test_mistral.py similarity index 100% rename from tests/models/decoder_only/language/test_mistral.py rename to tests/models/language/generation/test_mistral.py diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/language/generation/test_models.py similarity index 100% rename from tests/models/decoder_only/language/test_models.py rename to tests/models/language/generation/test_models.py diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/language/generation/test_phimoe.py similarity index 100% rename from tests/models/decoder_only/language/test_phimoe.py rename to tests/models/language/generation/test_phimoe.py diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/language/pooling/__init__.py similarity index 100% rename from tests/models/decoder_only/language/__init__.py rename to tests/models/language/pooling/__init__.py diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/language/pooling/test_cls_models.py similarity index 100% 
rename from tests/models/embedding/language/test_cls_models.py rename to tests/models/language/pooling/test_cls_models.py diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/language/pooling/test_embedding.py similarity index 98% rename from tests/models/embedding/language/test_embedding.py rename to tests/models/language/pooling/test_embedding.py index 5deb35fa3..2a90f47af 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -8,7 +8,7 @@ import pytest from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ..utils import check_embeddings_close +from ...utils import check_embeddings_close @pytest.mark.parametrize( diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py similarity index 64% rename from tests/models/embedding/language/test_gritlm.py rename to tests/models/language/pooling/test_gritlm.py index 87a1dde93..3ad6e7190 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -7,11 +7,10 @@ from array import array import openai import pytest -import pytest_asyncio from scipy.spatial.distance import cosine -import vllm -import vllm.config +from vllm import LLM, SamplingParams +from vllm.config import ModelConfig from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -31,73 +30,45 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch: pytest.MonkeyPatch): - # GritLM embedding implementation is only supported by XFormers backend. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - - from vllm.model_executor.models.gritlm import GritLMPooler - - # Create an LLM object to get the model config. 
- llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) - - -@pytest.fixture(scope="module") -def server_embedding(): - # GritLM embedding implementation is only supported by XFormers backend. - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="module") -def server_generate(): - args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server +def test_find_array(): + from vllm.model_executor.models.gritlm import GritLMPooler + model_config = ModelConfig( + MODEL_NAME, + task="embed", + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="bfloat16", + seed=0, + ) + pooler = GritLMPooler(model_config=model_config) -@pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, 
_arr([3, 5]), start_idx=0) == -1 -@pytest_asyncio.fixture -async def client_generate(server_generate: RemoteOpenAIServer): - async with server_generate.get_async_client() as async_client: - yield async_client + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) def run_llm_encode( - llm: vllm.LLM, + llm: LLM, queries: list[str], instruction: str, -) -> list[float]: - outputs = llm.encode([instruction + q for q in queries], ) +) -> list[list[float]]: + outputs = llm.embed([instruction + q for q in queries]) return [output.outputs.embedding for output in outputs] async def run_client_embeddings( - client: vllm.LLM, + client: openai.AsyncOpenAI, queries: list[str], instruction: str, -) -> list[float]: +) -> list[list[float]]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -132,7 +103,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: list[float], d_rep: list[float]): +def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) @@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) + assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch, + vllm_runner): # GritLM embedding implementation is only supported by XFormers backend. 
with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + with vllm_runner( + MODEL_NAME, + task="embed", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + llm = vllm_model.model + + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) + + validate_embed_output(q_rep, d_rep) + + +@pytest.mark.asyncio +async def test_gritlm_api_server_embedding(): + queries, q_instruction, documents, d_instruction = get_test_data() + + # GritLM embedding implementation is only supported by XFormers backend. + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_embedding = server.get_async_client() - d_rep = run_llm_encode( - llm, + d_rep = await run_client_embeddings( + client_embedding, documents, d_instruction, ) - q_rep = run_llm_encode( - llm, + q_rep = await run_client_embeddings( + client_embedding, queries, q_instruction, ) - validate_embed_output(q_rep, d_rep) - - -@pytest.mark.asyncio -async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI, ): - queries, q_instruction, documents, d_instruction = get_test_data() + validate_embed_output(q_rep, d_rep) - d_rep = await run_client_embeddings( - client_embedding, - documents, - d_instruction, - ) - q_rep = await run_client_embeddings( - client_embedding, - queries, - q_instruction, - ) - validate_embed_output(q_rep, d_rep) +def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): + # GritLM embedding implementation is only supported by XFormers backend. 
+ with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") + input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" -def test_gritlm_offline_gen(): - input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" + with vllm_runner( + MODEL_NAME, + task="generate", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + llm = vllm_model.model - llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN) - sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256) - outputs = llm.generate(input, sampling_params=sampling_params) + sampling_params = SamplingParams(temperature=0.0, max_tokens=256) + outputs = llm.generate(input, sampling_params=sampling_params) - assert outputs[0].outputs[0].text == "The capital of France is Paris." + assert outputs[0].outputs[0].text == "The capital of France is Paris." @pytest.mark.asyncio -async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI): +async def test_gritlm_api_server_generate(): input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" - outputs = await client_generate.completions.create( - model=MODEL_NAME, - prompt=input, - max_tokens=256, - temperature=0.0, - ) + # GritLM embedding implementation is only supported by XFormers backend. + args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_generate = server.get_async_client() + + outputs = await client_generate.completions.create( + model=MODEL_NAME, + prompt=input, + max_tokens=256, + temperature=0.0, + ) assert outputs.choices[0].text == "The capital of France is Paris." 
diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/language/pooling/test_jina.py similarity index 98% rename from tests/models/embedding/language/test_jina.py rename to tests/models/language/pooling/test_jina.py index 1e234368f..154aefe59 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -8,9 +8,10 @@ import math import pytest -from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy from vllm import PoolingParams +from ...utils import check_embeddings_close, matryoshka_fy + SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta ] diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/language/pooling/test_scoring.py similarity index 100% rename from tests/models/embedding/language/test_scoring.py rename to tests/models/language/pooling/test_scoring.py diff --git a/tests/models/embedding/language/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py similarity index 97% rename from tests/models/embedding/language/test_snowflake_arctic_embed.py rename to tests/models/language/pooling/test_snowflake_arctic_embed.py index 2b884fcee..81abc0e9e 100644 --- a/tests/models/embedding/language/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`. 
""" import pytest -from tests.models.embedding.utils import EmbedModelInfo - -from ..utils import check_embeddings_close +from ...utils import EmbedModelInfo, check_embeddings_close EMBEDDING_PROMPTS = [ 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', diff --git a/tests/models/embedding/language/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py similarity index 100% rename from tests/models/embedding/language/test_truncation_control.py rename to tests/models/language/pooling/test_truncation_control.py diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/multimodal/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/__init__.py rename to tests/models/multimodal/generation/__init__.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/multimodal/generation/test_common.py similarity index 98% rename from tests/models/decoder_only/vision_language/test_models.py rename to tests/models/multimodal/generation/test_common.py index 3dd82b93f..b21c80bef 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/multimodal/generation/test_common.py @@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = { multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, + dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, @@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], ), "minicpmo_26": VLMTestInfo( models=["openbmb/MiniCPM-o-2_6"], @@ -434,6 +437,8 
@@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], ), "minicpmv_26": VLMTestInfo( models=["openbmb/MiniCPM-V-2_6"], @@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], ), "minimax_vl_01": VLMTestInfo( models=["MiniMaxAI/MiniMax-VL-01"], diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py similarity index 100% rename from tests/models/encoder_decoder/vision_language/test_florence2.py rename to tests/models/multimodal/generation/test_florence2.py diff --git a/tests/models/decoder_only/audio_language/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py similarity index 100% rename from tests/models/decoder_only/audio_language/test_granite_speech.py rename to tests/models/multimodal/generation/test_granite_speech.py diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py similarity index 99% rename from tests/models/decoder_only/vision_language/test_interleaved.py rename to tests/models/multimodal/generation/test_interleaved.py index 8804497ae..92c8155fe 100644 --- a/tests/models/decoder_only/vision_language/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("