Merge tag 'v0.9.1' into v0.9.1-ori

cc7f22a8 · zhuwenwen · b9ea0c09 · b6553be1 · cc7f22a8 · cc7f22a8
Commit cc7f22a8 authored Jun 11, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest


--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from collections import OrderedDict
 from typing import NamedTuple, Optional

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import random
@@ -6,8 +7,6 @@ import tempfile
 from typing import Union
 from unittest.mock import patch

-import pytest
-
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
@@ -17,13 +16,7 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
+NUM_LORAS = 16


 @patch.dict(os.environ, {"RANK": "0"})
@@ -67,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(
            block_size=16,
-            gpu_memory_utilization=1.0,
            swap_space=0,
            cache_dtype="auto",
        ),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
-                               max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8,
+                               max_cpu_loras=NUM_LORAS,
+                               max_loras=NUM_LORAS),
    )
    worker = worker_cls(
        vllm_config=vllm_config,
@@ -87,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
    set_active_loras(worker, [])
    assert worker.list_loras() == set()

-    n_loras = 32
    lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
    ]

    set_active_loras(worker, lora_requests)
@@ -98,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
        for lora_request in lora_requests
    }

-    for i in range(32):
+    for i in range(NUM_LORAS):
        random.seed(i)
        iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
        random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
        set_active_loras(worker, lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id

--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from dataclasses import dataclass
 from typing import Optional, Union

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import time


--- a/tests/mistral_tool_use/conftest.py
+++ b/tests/mistral_tool_use/conftest.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
 import pytest_asyncio

--- a/tests/mistral_tool_use/test_mistral_tool_calls.py
+++ b/tests/mistral_tool_use/test_mistral_tool_calls.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import openai
 import pytest

--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from typing import Optional


--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest


--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
 import torch

--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
 import pickle

--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import random
 from unittest.mock import patch

--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os

 import pytest

-from vllm.model_executor.layers.pooler import CLSPool, PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -14,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5")
 REVISION = os.environ.get("REVISION", "main")

 MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
-                                    "intfloat/multilingual-e5-small")
+                                    "intfloat/multilingual-e5-base")
 REVISION_ROBERTA = os.environ.get("REVISION", "main")


@@ -40,17 +41,15 @@ def test_model_loading_with_params(vllm_runner):

        # asserts on the pooling config files
        assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize

        # asserts on the tokenizer loaded
        assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5"
-        assert model_tokenizer.tokenizer_config["do_lower_case"]
        assert model_tokenizer.tokenizer.model_max_length == 512

        def check_model(model):
            assert isinstance(model, BertEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.CLS
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, CLSPool)

        vllm_model.apply_model(check_model)

@@ -80,16 +79,15 @@ def test_roberta_model_loading_with_params(vllm_runner):

        # asserts on the pooling config files
        assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
-        assert model_config.pooler_config.pooling_norm
+        assert model_config.pooler_config.normalize

        # asserts on the tokenizer loaded
-        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small"
-        assert not model_tokenizer.tokenizer_config["do_lower_case"]
+        assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base"
+        assert model_tokenizer.tokenizer.model_max_length == 512

        def check_model(model):
            assert isinstance(model, RobertaEmbeddingModel)
-            assert model._pooler.pooling_type == PoolingType.MEAN
-            assert model._pooler.normalize
+            assert isinstance(model._pooler, MeanPool)

        vllm_model.apply_model(check_model)


--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import tempfile

--- a/tests/models/language/generation/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional

 import pytest

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 from typing import Optional

@@ -86,7 +87,6 @@ AITER_MODEL_LIST = [
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
            "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.cpu_model],
        )
    ])
 @pytest.mark.parametrize("max_tokens", [32])

--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

 from ...utils import check_logprobs_close

--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest


--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest