Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -7,30 +7,60 @@ import pytest
 from vllm.distributed.utils import get_pp_indices
-def test_custom_layer_partition():
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
-    def _verify(partition_str, num_layers, pp_size, goldens):
+    with monkeypatch.context() as m:
-        bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
+        def _verify(partition_str, num_layers, pp_size, goldens):
-        for pp_rank, golden in enumerate(goldens):
+            bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
-            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
-        if bak is not None:
+            for pp_rank, golden in enumerate(goldens):
-            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+            if bak is not None:
-    # Even partition
+                m.setenv("VLLM_PP_LAYER_PARTITION", bak)
-    _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Balanced partition
+        # Even partition
-    _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
+        _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Put reminder somewhere
+        # Balanced partition
-    _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
+        _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
-    # Invalid partition strings
+        # Put reminder somewhere
-    with pytest.raises(ValueError):
+        _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
-        _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Invalid partition strings
-    with pytest.raises(ValueError):
+        with pytest.raises(ValueError):
-        _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+            _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-    # Wrong number of partitions
+        with pytest.raises(ValueError):
-    with pytest.raises(ValueError):
+            _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-        _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of partitions
-    # Wrong number of layers
+        with pytest.raises(ValueError):
-    with pytest.raises(ValueError):
+            _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
-        _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+        # Wrong number of layers
+        with pytest.raises(ValueError):
+            _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+@pytest.mark.parametrize(
+    "num_hidden_layers,pp_size,pp_rank,indices",
+    [
+        # pp_size 2
+        (2, 2, 0, (0, 1)),
+        (2, 2, 1, (1, 2)),
+        (3, 2, 0, (0, 2)),
+        (3, 2, 1, (2, 3)),
+        # pp_size 3
+        (3, 3, 0, (0, 1)),
+        (3, 3, 1, (1, 2)),
+        (3, 3, 2, (2, 3)),
+        (4, 3, 0, (0, 1)),
+        (4, 3, 1, (1, 3)),
+        (4, 3, 2, (3, 4)),
+        (5, 3, 0, (0, 2)),
+        (5, 3, 1, (2, 4)),
+        (5, 3, 2, (4, 5)),
+    ])
+def test_uneven_auto_partition(
+    num_hidden_layers: int,
+    pp_size: int,
+    pp_rank: int,
+    indices: tuple[int, int],
+):
+    assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
-import os
+from typing import TYPE_CHECKING
+import os
 import pytest
-from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
+from ..utils import compare_two_settings, create_new_process_for_each_test, models_path_prefix
+if TYPE_CHECKING:
+    from typing_extensions import LiteralString
 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models
    "FLASH_ATTN",
    # "FLASHINFER",
 ])
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
+def test_pp_cudagraph(
-    cudagraph_args = [
+    monkeypatch: pytest.MonkeyPatch,
-        # use half precision for speed and memory savings in CI environment
+    PP_SIZE: int,
-        "--dtype",
+    MODEL_NAME: str,
-        "float16",
+    ATTN_BACKEND: LiteralString,
-        "--pipeline-parallel-size",
+):
-        str(PP_SIZE),
+    with monkeypatch.context() as m:
-        "--distributed-executor-backend",
+        cudagraph_args = [
-        "mp",
+            # use half precision for speed and memory savings in CI environment
-    ]
+            "--dtype",
-    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+            "float16",
+            "--pipeline-parallel-size",
-    eager_args = cudagraph_args + ["--enforce-eager"]
+            str(PP_SIZE),
+            "--distributed-executor-backend",
-    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+            "mp",
+        ]
+        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
+        eager_args = cudagraph_args + ["--enforce-eager"]
+        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -2,7 +2,6 @@
 import multiprocessing
 import os
-from typing import Dict, List
 import pytest
 import torch
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
 def distributed_run(fn, world_size):
    number_of_processes = world_size
-    processes: List[multiprocessing.Process] = []
+    processes: list[multiprocessing.Process] = []
    for i in range(number_of_processes):
-        env: Dict[str, str] = {}
+        env: dict[str, str] = {}
        env['RANK'] = str(i)
        env['LOCAL_RANK'] = str(i)
        env['WORLD_SIZE'] = str(number_of_processes)

--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -3,7 +3,6 @@
 import multiprocessing
 import random
 import time
-from typing import List
 import numpy as np
 import torch.distributed as dist
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
 from vllm.utils import get_ip, get_open_port, update_environment_variables
-def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
+def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
    np.random.seed(seed)
    sizes = np.random.randint(1, 10_000, n)
    # on average, each array will have 5k elements

--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -9,6 +9,8 @@ import torch.distributed as dist
 from vllm import LLM, SamplingParams
 from vllm.distributed.parallel_state import get_world_group
+dist.init_process_group(backend="gloo")
 # Create prompts
 prompts = [
    "Hello, my name is",
@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m",
          tensor_parallel_size=2,
          distributed_executor_backend="external_launcher",
          gpu_memory_utilization=random.uniform(0.7, 0.9),
-          swap_space=random.randint(1, 4))
+          swap_space=random.randint(1, 4),
+          seed=0)
 outputs = llm.generate(prompts, sampling_params)
@@ -48,6 +51,12 @@ test_consistent_across_ranks(
 test_consistent_across_ranks(
    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+# make sure we can access the model parameters from the calling process
+# of the `LLM` instance.
+params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
+              model.parameters())
+test_consistent_across_ranks(len(params))
 # all ranks should have the same outputs
 for output in outputs:
    prompt = output.prompt

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -3,7 +3,7 @@
 Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
 """
-from typing import List, Optional, Tuple
+from typing import Optional
 import pytest
 import os
@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs
 from ..conftest import DecoderPromptType
 from ..models.utils import check_logprobs_close
 from ..utils import models_path_prefix
-from vllm.utils import is_hip
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
 LIST_ENC_DEC_SUPPORTED_BACKENDS = [
@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
 ]
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 def vllm_to_hf_output(
-    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
+    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
 ):
    """Sanitize vllm output to be comparable with hf output."""

--- a/tests/engine/conftest.py
+++ b/tests/engine/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -2,16 +2,12 @@
 import pytest
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
        "decoration.")
    engine_args = EngineArgs(model=model,
-                             load_format=LoadFormat.RUNAI_STREAMER,
                             block_size=block_size,
                             enable_prefix_caching=True)

--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -2,11 +2,10 @@
 import asyncio
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 import pytest
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams
 import os
 from ..utils import models_path_prefix
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 class Mock:
    ...
@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor):
    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
-                       args: Tuple = (),
+                       args: tuple = (),
-                       kwargs: Optional[Dict] = None) -> List[Any]:
+                       kwargs: Optional[dict] = None) -> list[Any]:
        # Drop marker to show that this was ran
        with open(".marker", "w"):
            ...
@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor
-@pytest.mark.parametrize("model",
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
-                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                 distributed_executor_backend=Mock)
        LLMEngine.from_engine_args(engine_args)
    with pytest.raises(ValueError):
@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model):
        AsyncLLMEngine.from_engine_args(engine_args)
-@pytest.mark.parametrize("model",
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path):
        engine_args = EngineArgs(
            model=model,
-            load_format=RUNAI_STREAMER_LOAD_FORMAT,
            distributed_executor_backend=CustomUniExecutor,
            enforce_eager=True,  # reduce test time
        )
@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path):
        os.chdir(cwd)
-@pytest.mark.parametrize("model",
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path):
        engine_args = AsyncEngineArgs(
            model=model,
-            load_format=RUNAI_STREAMER_LOAD_FORMAT,
            distributed_executor_backend=CustomUniExecutorAsync,
            enforce_eager=True,  # reduce test time
        )
@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path):
        os.chdir(cwd)
-@pytest.mark.parametrize("model",
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_respect_ray(model):
    # even for TP=1 and PP=1,
    # if users specify ray, we should use ray.
@@ -117,7 +105,6 @@ def test_respect_ray(model):
    engine_args = EngineArgs(
        model=model,
        distributed_executor_backend="ray",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
        enforce_eager=True,  # reduce test time
    )
    engine = LLMEngine.from_engine_args(engine_args)

--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.utils import Counter
-from ...core.utils import create_seq_group
+from ..core.utils import create_seq_group
 @pytest.mark.parametrize("seq_output_len", [128])

--- a/tests/engine/test_multiproc_workers.py
+++ b/tests/engine/test_multiproc_workers.py
@@ -4,7 +4,7 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from time import sleep
-from typing import Any, List, Tuple
+from typing import Any
 import pytest
@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase
 class DummyWorkerWrapper(WorkerWrapperBase):
    """Dummy version of vllm.worker.worker.Worker"""
-    def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
+    def worker_method(self, worker_input: Any) -> tuple[int, Any]:
        sleep(0.05)
        if isinstance(worker_input, Exception):
@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase):
        return self.rpc_rank, input
-def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
+def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]:
    result_handler = ResultHandler()
    vllm_config = VllmConfig()
    workers = [

--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -2,22 +2,19 @@
 import pytest
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.
-    llm = LLM(model=model,
+    llm = LLM(
-              skip_tokenizer_init=True,
+        model=model,
-              load_format=LoadFormat.RUNAI_STREAMER)
+        skip_tokenizer_init=True,
+    )
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
    with pytest.raises(ValueError, match="cannot pass text prompts when"):

--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -44,10 +44,14 @@ def run_test(more_args=None):
            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+# TODO: [AlexM] Fix it with new CI/CD tests
+TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""
    with monkeypatch.context() as m:
@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
            # Limit compilation time for TPU V1
            more_args = "max_num_seqs=64"
+            # Add TP test (if provided)
+            if TPU_TP_TEST_STR:
+                more_args += ",{}".format(TPU_TP_TEST_STR)
        run_test(more_args)
-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V0 Engine."""
    with monkeypatch.context() as m:

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import List
 import os
 import pytest
 from vllm import LLM
-from vllm.config import LoadFormat
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS
 from ...utils import models_path_prefix
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 def test_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -35,8 +28,7 @@ def test_chat():
 def test_multi_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
    prompt1 = "Explain the concept of entropy."
    prompt2 = "Explain what among us is."
@@ -71,11 +63,9 @@ def test_multi_chat():
 @pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
-def test_chat_multi_image(image_urls: List[str]):
+def test_chat_multi_image(image_urls: list[str]):
    llm = LLM(
-        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
+        model="microsoft/Phi-3.5-vision-instruct",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
-        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,

--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -4,12 +4,12 @@ import pytest
 from vllm import LLM
-from ...utils import fork_new_process_for_each_test
+from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("tp_size", [1, 2])
 @pytest.mark.parametrize("backend", ["mp", "ray"])
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend):
    if tp_size == 1 and backend == "ray":
        pytest.skip("Skip duplicate test case")
@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
    def echo_rank(self):
        return self.rank
-    from vllm.worker.worker import Worker
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-    class MyWorker(Worker):
-        def echo_rank(self):
-            return self.rank
-    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,
-              distributed_executor_backend=backend,
+              distributed_executor_backend=backend)
-              worker_cls=MyWorker)
+    assert llm.collective_rpc(echo_rank) == list(range(tp_size))
-    for method in ["echo_rank", echo_rank]:
-        assert llm.collective_rpc(method) == list(range(tp_size))
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
 # SPDX-License-Identifier: Apache-2.0
 import weakref
-from typing import List
 import pytest
 import os
 from vllm import LLM, PoolingParams, PoolingRequestOutput
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "e5-mistral-7b-instruct")
+MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
 PROMPTS = [
    "Hello, my name is",
@@ -35,11 +33,11 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
-              enforce_eager=True)
+              enforce_eager=True,
+              seed=0)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)
@@ -49,8 +47,8 @@ def llm():
    cleanup_dist_env_and_memory()
-def assert_outputs_equal(o1: List[PoolingRequestOutput],
+def assert_outputs_equal(o1: list[PoolingRequestOutput],
-                         o2: List[PoolingRequestOutput]):
+                         o2: list[PoolingRequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
 # SPDX-License-Identifier: Apache-2.0
 import weakref
-from typing import List
 import os
 import pytest
 from vllm import LLM, RequestOutput, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "distilgpt2")
+MODEL_NAME = os.path.join(models_path_prefix, "distilbert/distilgpt2")
 PROMPTS = [
    "Hello, my name is",
@@ -33,7 +31,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,
@@ -47,7 +44,7 @@ def llm():
    cleanup_dist_env_and_memory()
-def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
+def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
    assert [o.outputs for o in o1] == [o.outputs for o in o2]

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -8,12 +8,11 @@ import os
 from huggingface_hub import snapshot_download
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "zephyr-7b-beta")
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 PROMPTS = [
    "Hello, my name is",
@@ -30,7 +29,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -7,8 +7,8 @@ import weakref
 import jsonschema
 import pytest
 import os
+from pydantic import BaseModel
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...utils import models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct")
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+GUIDED_DECODING_BACKENDS = [
+    "outlines", "lm-format-enforcer", "xgrammar", "guidance"
+]
 @pytest.fixture(scope="module")
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
+    llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0)
-              load_format=LoadFormat.RUNAI_STREAMER,
-              max_model_len=1024)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)
@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
                     guided_options_request=dict(guided_regex=sample_regex))
+@pytest.mark.skip_global_cleanup
+def test_disable_guided_decoding_fallback(sample_regex, llm):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         regex=sample_regex,
+                                         backend="xgrammar:no-fallback"))
+    with pytest.raises(
+            ValueError,
+            match="xgrammar does not support regex guided decoding"):
+        llm.generate(prompts="This should fail",
+                     sampling_params=sampling_params,
+                     use_tqdm=True)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 def test_guided_json_object(llm, guided_decoding_backend: str):
@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
            # Parse to verify it is valid JSON
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)
+@pytest.mark.skip_global_cleanup
+def test_json_with_any_whitespace_disabled(llm):
+    class ResponseSchema(BaseModel):
+        clarifying_question: str
+        cost_per_serving: str
+        calories: str
+        type_dish_ids: str
+        type_meal_ids: str
+        product_ids: list[str]
+        exclude_product_ids: list[str]
+        allergen_ids: list[str]
+        total_cooking_time: str
+        kitchen_ids: str
+        holiday_ids: str
+    # Note: Without this setting, the response is sometimes full of `\n`
+    # for some models. This option prevents that.
+    guided_decoding_backend = 'xgrammar:disable-any-whitespace'
+    schema = ResponseSchema.model_json_schema()
+    guided_params = GuidedDecodingParams(json=schema,
+                                         backend=\
+                                           guided_decoding_backend)
+    sampling_params = SamplingParams(max_tokens=2000,
+                                     frequency_penalty=0,
+                                     presence_penalty=-1.1,
+                                     repetition_penalty=1.3,
+                                     guided_decoding=guided_params)
+    prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You"
+              "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a "
+              "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n")
+    outputs = llm.generate(prompts=prompt,
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        assert "\n" not in generated_text
+        # Parse to verify it is valid JSON
+        parsed_json = json.loads(generated_text)
+        assert isinstance(parsed_json, dict)
+        jsonschema.validate(instance=parsed_json, schema=schema)
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -4,14 +4,22 @@ import sys
 import os
 from contextlib import nullcontext
+import pytest
 from vllm_test_utils import BlameResult, blame
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    V1 only supports xgrammar so this is irrelevant.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
 def run_normal_opt125m():
    prompts = [
        "Hello, my name is",
@@ -46,8 +54,7 @@ def run_normal():
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM without guided decoding as a baseline.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
+    llm = LLM(model="distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True,
              gpu_memory_utilization=0.3)
    outputs = llm.generate(prompts, sampling_params)
@@ -63,8 +70,7 @@ def run_normal():
 def run_lmfe(sample_regex):
    # Create an LLM with guided decoding enabled.
-    llm = LLM(model=os.path.join(models_path_prefix, "distilgpt2"),
+    llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
-              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)