Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 _M = TypeVar("_M")
+MODELS_ON_S3 = [
+    "distilbert/distilgpt2",
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Meta-Llama-3-8B",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "openai-community/gpt2",
+    "ArthurZ/Ilama-3.2-1B",
+    "llava-hf/llava-1.5-7b-hf",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "ai21labs/Jamba-tiny-random",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
+]
+MODEL_WEIGHTS_S3_BUCKET = models_path_prefix
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -680,8 +745,14 @@ class VllmRunner:
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
+        load_format: Optional[LoadFormat] = None,
        **kwargs,
    ) -> None:
+        if model_name in MODELS_ON_S3 and not load_format:
+            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
+            load_format = LoadFormat.RUNAI_STREAMER
+        if not load_format:
+            load_format = LoadFormat.AUTO
        self.model = LLM(
            model=model_name,
            task=task,
@@ -696,6 +767,7 @@ class VllmRunner:
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
+            load_format=load_format,
            **kwargs,
        )

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -7,6 +7,9 @@ import pytest  # noqa
 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob, SequenceGroup
 from .utils import create_dummy_prompt
@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output):
    return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
-def append_new_token(seq_group, token_id: int):
+def append_new_token(seq_group: SequenceGroup, token_id: int):
    for seq in seq_group.get_seqs():
        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
@@ -123,6 +126,232 @@ def test_chunk():
    assert out.num_batched_tokens == 57
+def test_concurrent_chunking():
+    """Verify prefills are chunked properly when 
+    --max-num-partial-prefills is > 1"""
+    block_size = 4
+    max_seqs = 60
+    max_model_len = 2000
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(
+        "generate",
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
+    )
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 32
+    cache_config.num_gpu_blocks = 32
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=60,
+                                           block_size=block_size)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+    # Verify both requests are chunked with half of max_num_batched_tokens each
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert seq_group_meta[0].token_chunk_size == 32
+    assert seq_group_meta[1].token_chunk_size == 32
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 64
+    # After one iteration, both should have 60 - 32 = 28 tokens left to prefill
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    assert seq_group_meta[0].token_chunk_size == 28
+    assert seq_group_meta[1].token_chunk_size == 28
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 56
+def test_concurrent_chunking_large_requests():
+    """Verify large prefill requests are run one at a time"""
+    block_size = 4
+    max_seqs = 60
+    max_model_len = 2000
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(
+        "generate",
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
+    )
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 3200  # large KV cache size for large requests
+    cache_config.num_gpu_blocks = 3200
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(
+            str(i),
+            prompt_length=1200,  # Very large prompt
+            block_size=block_size)
+        scheduler.add_seq_group(seq_group)
+    # Verify only a single request is chunked, and it gets all 64 tokens
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert len(get_sequence_groups(out)) == 1
+    assert seq_group_meta[0].token_chunk_size == 64
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 64
+def test_short_prompts_jump_long_prompts_in_queue():
+    """Verify large prefill requests are punted behind smaller ones if 
+    another large prefill request is already running"""
+    block_size = 4
+    max_seqs = 60
+    max_model_len = 2000
+    max_num_batched_tokens = 64
+    scheduler_config = SchedulerConfig(
+        "generate",
+        max_num_batched_tokens,
+        max_seqs,
+        max_model_len,
+        enable_chunked_prefill=True,
+        max_num_partial_prefills=2,  # Up to 2 partial prefills at a time
+    )
+    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
+    cache_config.num_cpu_blocks = 3200  # large KV cache size for large requests
+    cache_config.num_gpu_blocks = 3200
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    long_seqs: List[SequenceGroup] = []
+    short_seqs: List[SequenceGroup] = []
+    # Add 2 large seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(
+            str(i),
+            prompt_length=1200,  # Very large prompt
+            block_size=block_size)
+        scheduler.add_seq_group(seq_group)
+        long_seqs.append(seq_group)
+        assert seq_group.is_prefill()
+    # Add 2 small seq groups behind them
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(
+            str(i + 2),
+            prompt_length=40,  # Very small prompt
+            block_size=block_size)
+        scheduler.add_seq_group(seq_group)
+        short_seqs.append(seq_group)
+        assert seq_group.is_prefill()
+    # Verify that one large request and one small request are chunked
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert seq_group_meta[0].token_chunk_size == 32  # large req gets 32 tokens
+    assert seq_group_meta[1].token_chunk_size == 32  # small req gets 32 tokens
+    # all 4 are prefilling
+    assert long_seqs[0].is_prefill()
+    assert long_seqs[1].is_prefill()
+    assert short_seqs[0].is_prefill()
+    assert short_seqs[1].is_prefill()
+    # First short and first long sequences have been scheduled
+    assert long_seqs[0].first_seq.get_num_computed_tokens() == 32
+    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
+    assert short_seqs[0].first_seq.get_num_computed_tokens() == 32
+    assert short_seqs[1].first_seq.get_num_computed_tokens() == 0
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 64
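+    # In the second iteration, the first small request has only 8 tokens
+    # left, so it finishes its prefill and is ready to decode afterwards.
+    # The second small request is scheduled for the first time,
+    # ahead of the second large request.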
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    # the new small req got 64 - (32+8) tokens
+    assert seq_group_meta[0].token_chunk_size == 24
+    assert seq_group_meta[1].token_chunk_size == 32  # large req still got 32
+    # the first small request had only 8 tokens left to prefill
+    assert seq_group_meta[2].token_chunk_size == 8  # 40-32
+    # The first small request has finished prefill and is ready to decode
+    assert long_seqs[0].is_prefill()
+    assert long_seqs[1].is_prefill()
+    assert not short_seqs[0].is_prefill()
+    assert short_seqs[1].is_prefill()
+    # Both small requests have started in front of the second long request
+    assert long_seqs[0].first_seq.get_num_computed_tokens() == 64
+    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
+    assert short_seqs[0].first_seq.get_num_computed_tokens() == 40
+    assert short_seqs[1].first_seq.get_num_computed_tokens() == 24
+    assert out.num_prefill_groups == 3
+    assert out.num_batched_tokens == 64
+    # the first small seq group has a new token appended.
+    append_new_token(short_seqs[0], 1)
+    # in the third iteration,
+    # the first small request is already decoding
+    # the second small request only has 16 tokens left and will enter decoding
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert seq_group_meta[0].token_chunk_size == 32  # large still got 32
+    # second small req prefills its remaining 40-24=16 tokens
+    assert seq_group_meta[1].token_chunk_size == 16
+    assert seq_group_meta[2].token_chunk_size == 1  # decode
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 49  # (32+16+1 decode)
+    # both small requests have now reached decode
+    assert long_seqs[0].is_prefill()
+    assert long_seqs[1].is_prefill()
+    assert not short_seqs[0].is_prefill()
+    assert not short_seqs[1].is_prefill()
+    assert long_seqs[0].first_seq.get_num_computed_tokens() == 96
+    assert long_seqs[1].first_seq.get_num_computed_tokens() == 0
+    assert short_seqs[0].first_seq.get_num_computed_tokens() == 41
+    assert short_seqs[1].first_seq.get_num_computed_tokens() == 40
+    # both the small seq groups have a new token appended
+    append_new_token(short_seqs[0], 1)
+    append_new_token(short_seqs[1], 1)
+    # in the fourth iteration, both small requests are decoding
+    # so large request gets all the budget
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    # large req gets 62 tokens (minus 2 for decode)
+    assert seq_group_meta[0].token_chunk_size == 62
+    assert seq_group_meta[1].token_chunk_size == 1  # decode
+    assert seq_group_meta[2].token_chunk_size == 1  # decode
+    assert out.num_prefill_groups == 1
+    assert out.num_batched_tokens == 64
+    assert long_seqs[0].first_seq.get_num_computed_tokens() == 158
+    # assert long_seqs[0].is_prefill()
+    # assert long_seqs[1].is_prefill()
+    # assert not short_seqs[0].is_prefill()
+    # assert not short_seqs[1].is_prefill()
+    # # both the small seq groups have a new token appended
+    # append_new_token(short_seqs[0], 1)
+    # append_new_token(short_seqs[1], 1)
+    # # in the fifth iteration, large request gets all the budget
+    # # while both small requests are decoding
+    # seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    # assert seq_group_meta[0].token_chunk_size == 62
+    # assert seq_group_meta[1].token_chunk_size == 1  # decode
+    # assert seq_group_meta[2].token_chunk_size == 1  # decode
+    # assert out.num_prefill_groups == 1
+    # assert out.num_batched_tokens == 64
 def test_complex():
    block_size = 4
    max_seqs = 60
@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs():
    assert not running[1].is_prefill()
-def test_perfix_caching():
+def test_prefix_caching():
    """Verify allocating full blocks when prefix caching is enabled."""
    block_size = 4
    max_seqs = 10
@@ -548,3 +777,86 @@ def test_perfix_caching():
    assert seq_group_meta[1].token_chunk_size == 12
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 62
+def test_prefix_caching_with_concurrent_partial_prefills():
+    """Verify allocating full blocks when prefix caching is enabled with 
+    --max-num-partial-prefills > 1."""
+    block_size = 4
+    max_seqs = 10
+    max_model_len = 8000
+    max_num_batched_tokens = 60  # With two slots, each slot will get 30 tokens
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
+                                       max_seqs,
+                                       max_model_len,
+                                       enable_chunked_prefill=True,
+                                       max_num_partial_prefills=2)
+    cache_config = CacheConfig(block_size,
+                               1.0,
+                               1,
+                               "auto",
+                               enable_prefix_caching=True)
+    cache_config.num_cpu_blocks = 0
+    cache_config.num_gpu_blocks = 32
+    scheduler = Scheduler(scheduler_config, cache_config, None)
+    running: List[SequenceGroup] = []
+    # Add seq groups to scheduler.
+    for i in range(2):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           block_size=block_size,
+                                           prompt_length=50)
+        scheduler.add_seq_group(seq_group)
+        running.append(seq_group)
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    # To partially prefill both sequences, both can chunk up to 30 tokens
+    # But the next lowest multiple of the block size (4) is 28
+    assert seq_group_meta[0].token_chunk_size == 28
+    assert seq_group_meta[1].token_chunk_size == 28
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 56
+    # On the next iteration, both sequences should finish prefill
+    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
+    assert set(get_sequence_groups(out)) == set(running)
+    # Both sequences have 50 - 28 = 22 tokens left to prefill.
+    # This is not a multiple of the block size, but we don't care since we don't
+    # cache the final partial block of prefix sequences
+    assert seq_group_meta[0].token_chunk_size == 22
+    assert seq_group_meta[1].token_chunk_size == 22
+    assert out.num_prefill_groups == 2
+    assert out.num_batched_tokens == 44
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
+def test_chunked_prefill_with_actual_engine(model: str,
+                                            max_num_partial_prefills: int):
+    """Make sure the model can actually sample with concurrent 
+    partial prefills
+    """
+    prompt = "hello" * 40
+    engine_args = EngineArgs(
+        model=model,
+        max_num_partial_prefills=max_num_partial_prefills,
+        max_num_batched_tokens=40,
+        max_num_seqs=8,
+        enable_chunked_prefill=True,
+        gpu_memory_utilization=0.8,
+    )
+    engine = LLMEngine.from_engine_args(engine_args)
+    sampling_params = SamplingParams(temperature=0)
+    for req_num in range(max_num_partial_prefills):
+        engine.add_request(f"{req_num}", prompt, sampling_params)
+    # first step
+    request_outputs = engine.step()
+    # means all are prefilling
+    assert len(request_outputs) == 0
+    assert len(engine.scheduler[0].running) == max_num_partial_prefills
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                      distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                          distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
 @ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 @ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 all workers in a node other than the head node, which can cause the test
 to fail.
 """
+import json
 import os
 from dataclasses import dataclass
 from typing import List, Literal, NamedTuple, Optional
@@ -15,6 +16,7 @@ import pytest
 from vllm.config import TaskOption
 from vllm.logger import init_logger
+from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
 logger = init_logger("test_pipeline_parallel")
@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple):
 class PPTestOptions(NamedTuple):
    multi_node_only: bool
-    trust_remote_code: bool
-    tokenizer_mode: Optional[str]
    load_format: Optional[str] = None
-    hf_overrides: Optional[str] = None
 @dataclass
 class PPTestSettings:
    parallel_setups: List[ParallelSetup]
+    # NOTE: the length of distributed_backends and
+    # vllm_major_versions should be the same, and they
+    # are first zipped together to iterate over all
+    # test settings.
    distributed_backends: List[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: List[str]
    task: TaskOption
    test_options: PPTestOptions
+    def __post_init__(self):
+        if len(self.distributed_backends) != len(self.vllm_major_versions):
+            raise ValueError(
+                f"Length mismatch: distributed_backends "
+                f"({len(self.distributed_backends)}) != "
+                f"vllm_major_versions ({len(self.vllm_major_versions)})")
    @staticmethod
    def detailed(
        *,
@@ -51,10 +63,7 @@ class PPTestSettings:
        pp_base: int = 2,
        multi_node_only: bool = False,
        task: TaskOption = "auto",
-        trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
    ):
        return PPTestSettings(
            parallel_setups=[
@@ -79,13 +88,12 @@ class PPTestSettings:
                              eager_mode=True,
                              chunked_prefill=False),
            ],
-            distributed_backends=["mp", "ray"],
+            # only ray is supported for V1
+            distributed_backends=["mp", "ray", "ray"],
+            vllm_major_versions=["0", "0", "1"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       trust_remote_code=trust_remote_code,
+                                       load_format=load_format),
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
        )
    @staticmethod
@@ -95,10 +103,7 @@ class PPTestSettings:
        pp_base: int = 2,
        task: TaskOption = "auto",
        multi_node_only: bool = False,
-        trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
    ):
        return PPTestSettings(
            parallel_setups=[
@@ -108,20 +113,19 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
+            vllm_major_versions=["0"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       trust_remote_code=trust_remote_code,
+                                       load_format=load_format),
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
        )
-    def iter_params(self, model_name: str):
+    def iter_params(self, model_id: str):
        opts = self.test_options
        for parallel_setup in self.parallel_setups:
-            for distributed_backend in self.distributed_backends:
+            for backend, vllm_major_version in zip(self.distributed_backends,
-                yield (model_name, parallel_setup, distributed_backend,
+                                                   self.vllm_major_versions):
+                yield (model_id, parallel_setup, backend, vllm_major_version,
                       self.task, opts)
@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"): PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"): PPTestSettings.fast(load_format="dummy"),  # noqa: E501
-    os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-13B-Chat"): PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-13B-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "bigscience/bloomz-1b1"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "THUDM/chatglm3-6b"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "THUDM/chatglm3-6b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "CohereForAI/c4ai-command-r-v01"): PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "CohereForAI/c4ai-command-r-v01"): PPTestSettings.fast(load_format="dummy"),
-    os.path.join(models_path_prefix, "databricks/dbrx-instruct"): PPTestSettings.fast(tp_base=8),
+    os.path.join(models_path_prefix, "databricks/dbrx-instruct"): PPTestSettings.fast(load_format="dummy"),
-    os.path.join(models_path_prefix, "Deci/DeciLM-7B-instruct"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "Deci/DeciLM-7B-instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "deepseek-ai/deepseek-llm-7b-chat"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V2-Lite-Chat"): PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V2-Lite-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "tiiuae/falcon-7b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "google/gemma-2b"): PPTestSettings.fast(),
@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = {
    os.path.join(models_path_prefix, "ibm/PowerMoE-3b"): PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "inceptionai/jais-13b-chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"): PPTestSettings.detailed(),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"): PPTestSettings.detailed(),
-    os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"): PPTestSettings.fast(),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"): PPTestSettings.fast(tp_base=4),
+    os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"): PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    os.path.join(models_path_prefix, "mosaicml/mpt-7b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "allenai/OLMo-1B-hf"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "shanearora/OLMo-7B-1124-hf"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "adept/persimmon-8b-chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "microsoft/phi-2"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"): PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"): PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'),  # noqa: E501
+    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"): PPTestSettings.detailed(multi_node_only=True, load_format="dummy"),  # noqa: E501
-    os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct"): PPTestSettings.fast(tp_base=2),
+    os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct"): PPTestSettings.fast(load_format="dummy"),  # noqa: E501
    # FIXME: Cannot load tokenizer in latest transformers version.
    # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
-    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
    # [Encoder-only]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = {
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
+    # Maps a model id (joined with models_path_prefix) to the PPTestSettings
+    # that test_tp_language_embedding expands with iter_params(); only ids
+    # that also appear in TEST_MODELS are actually parametrized.
    # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"): PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
+    os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"): PPTestSettings.fast(),
+    # load_format="dummy" makes _compare_tp add shrinking hf_overrides
+    # (4 layers, hidden_size 512, ...) to avoid OOM.
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"): PPTestSettings.fast(load_format="dummy"),
 }
 MULTIMODAL_MODELS = {
@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = {
    os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "facebook/chameleon-7b"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "adept/fuyu-8b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "THUDM/glm-4v-9b"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "THUDM/glm-4v-9b"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"): PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"): PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
+    os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"): PPTestSettings.fast(load_format="dummy"),
-    os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"): PPTestSettings.fast(),
    os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"): PPTestSettings.fast(),
-    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3"): PPTestSettings.fast(trust_remote_code=True),
+    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"): PPTestSettings.fast(),
    # [Encoder-decoder]
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = {
 TEST_MODELS = [
    # [LANGUAGE GENERATION]
    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
-    os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
    # [LANGUAGE EMBEDDING]
    os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
@@ -234,21 +238,23 @@ TEST_MODELS = [
    # [MULTIMODAL GENERATION]
    os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
    os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
-    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3"),
+    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
    # [LANGUAGE GENERATION - HYBRID ARCH]
    os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"),
 ]
 def _compare_tp(
-    model_name: str,
+    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"],
+    is_multimodal: bool,
 ):
+    """Run one model under two argument sets via compare_two_settings.
+
+    Args:
+        model_id: Looked up in HF_EXAMPLE_MODELS for trust_remote_code,
+            tokenizer_mode and hf_overrides, and passed on to
+            compare_two_settings.
+        parallel_setup: Unpacked into tp_size, ..., eager_mode and
+            chunked_prefill.
+        distributed_backend: With "ray", the Ray compiled-DAG environment
+            is used for every V1 case and, on V0, only for
+            tp_size == 2, pp_size == 2 with chunked prefill.
+        vllm_major_version: Exported as VLLM_USE_V1 in that environment.
+        task: Task option for the run (its use is outside the visible hunks).
+        test_options: Unpacked into (multi_node_only, load_format).
+        num_gpus_available: The test is skipped when this is smaller than
+            tp_size * pp_size.
+        method: Forwarded to compare_two_settings.
+        is_multimodal: When True, the dummy-load overrides are nested under
+            "text_config" instead of being applied at the top level.
+    """
    (
        tp_size,
@@ -256,13 +262,32 @@ def _compare_tp(
        eager_mode,
        chunked_prefill,
    ) = parallel_setup
-    (
-        multi_node_only,
+    multi_node_only, load_format = test_options
-        trust_remote_code,
-        tokenizer_mode,
+    # NOTE(review): model ids in this file are built with
+    # os.path.join(models_path_prefix, ...); confirm find_hf_info accepts
+    # that form.
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
-        load_format,
+    model_info.check_transformers_version(on_fail="skip")
-        hf_overrides,
-    ) = test_options
+    trust_remote_code = model_info.trust_remote_code
+    tokenizer_mode = model_info.tokenizer_mode
+    # Copy before mutating: the dummy-load overrides added below must not
+    # leak into the (presumably shared) HF_EXAMPLE_MODELS entry.
+    hf_overrides = dict(model_info.hf_overrides)
+    if load_format == "dummy":
+        # Avoid OOM
+        text_overrides = {
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
+        }
+        if is_multimodal:
+            hf_overrides.update({"text_config": text_overrides})
+        else:
+            hf_overrides.update(text_overrides)
+    else:
+        model_info.check_available_online(on_fail="skip")
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
@@ -294,12 +319,15 @@ def _compare_tp(
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
-        common_args.extend(["--hf-overrides", hf_overrides])
+        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
-    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
+    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
-            and chunked_prefill):
+    if distributed_backend == "ray" and (vllm_major_version == "1"
-        # Test Ray ADAG for a subset of the tests
+                                         or specific_case):
+        # For V1, test Ray ADAG for all the tests
+        # For V0, test Ray ADAG for a subset of the tests
        pp_env = {
+            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@@ -334,11 +362,7 @@ def _compare_tp(
    ]
    try:
-        compare_two_settings(model_name,
+        compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
-                             pp_args,
-                             tp_args,
-                             pp_env,
-                             method=method)
    except Exception:
        if pp_env is None:
            raise
@@ -348,81 +372,87 @@ def _compare_tp(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "test_options"),
+     "task", "test_options"),
    [
-        params for model_name, settings in TEXT_GENERATION_MODELS.items()
+        params for model_id, settings in TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_name)
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
-        if model_name in TEST_MODELS
    ],
 )
 @fork_new_process_for_each_test
 def test_tp_language_generation(
-    model_name: str,
+    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
+    """Run _compare_tp with method="generate" for text-generation models.
+
+    Parametrized over every tuple yielded by iter_params() for the
+    TEXT_GENERATION_MODELS entries whose id is also in TEST_MODELS.
+    """
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
-                method="generate")
+                method="generate",
+                is_multimodal=False)
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "test_options"),
+     "task", "test_options"),
    [
-        params for model_name, settings in EMBEDDING_MODELS.items()
+        params for model_id, settings in EMBEDDING_MODELS.items()
-        for params in settings.iter_params(model_name)
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
-        if model_name in TEST_MODELS
    ],
 )
 @fork_new_process_for_each_test
 def test_tp_language_embedding(
-    model_name: str,
+    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
+    """Run _compare_tp with method="encode" for embedding models.
+
+    Parametrized over every tuple yielded by iter_params() for the
+    EMBEDDING_MODELS entries whose id is also in TEST_MODELS.
+    """
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
-                method="encode")
+                method="encode",
+                is_multimodal=False)
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "test_options"),
+     "task", "test_options"),
    [
-        params for model_name, settings in MULTIMODAL_MODELS.items()
+        params for model_id, settings in MULTIMODAL_MODELS.items()
-        for params in settings.iter_params(model_name)
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
-        if model_name in TEST_MODELS
    ],
 )
 @fork_new_process_for_each_test
 def test_tp_multimodal_generation(
-    model_name: str,
+    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
+    """Run _compare_tp with method="generate" for multimodal models.
+
+    Parametrized over every tuple yielded by iter_params() for the
+    MULTIMODAL_MODELS entries whose id is also in TEST_MODELS.
+    is_multimodal=True makes _compare_tp nest its dummy-load overrides
+    under "text_config".
+    """
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
-                method="generate")
+                method="generate",
+                is_multimodal=True)
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -2,14 +2,16 @@
 import pytest
+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
-import os
-from ..utils import models_path_prefix
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
        "decoration.")
    engine_args = EngineArgs(model=model,
+                             load_format=LoadFormat.RUNAI_STREAMER,
                             block_size=block_size,
                             enable_prefix_caching=True)

--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -2,13 +2,15 @@
 import pytest
+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
-import os
-from ..utils import models_path_prefix
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and
    # without optional detokenization, that detokenization includes text
@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str):
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")
-    llm = LLM(model=model)
+    llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
    sampling_params = SamplingParams(max_tokens=10,
                                     temperature=0.0,
                                     detokenize=False)

--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import pytest
+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams
 import os
 from ..utils import models_path_prefix
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 class Mock:
    ...
@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor
+@pytest.mark.parametrize("model",
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
+                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                 distributed_executor_backend=Mock)
        LLMEngine.from_engine_args(engine_args)
    with pytest.raises(ValueError):
@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model):
        AsyncLLMEngine.from_engine_args(engine_args)
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path):
        engine_args = EngineArgs(
            model=model,
+            load_format=RUNAI_STREAMER_LOAD_FORMAT,
            distributed_executor_backend=CustomUniExecutor,
+            enforce_eager=True,  # reduce test time
        )
        engine = LLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)
@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path):
        os.chdir(cwd)
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path):
        assert not os.path.exists(".marker")
        engine_args = AsyncEngineArgs(
-            model=model, distributed_executor_backend=CustomUniExecutorAsync)
+            model=model,
+            load_format=RUNAI_STREAMER_LOAD_FORMAT,
+            distributed_executor_backend=CustomUniExecutorAsync,
+            enforce_eager=True,  # reduce test time
+        )
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)
@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path):
        assert os.path.exists(".marker")
    finally:
        os.chdir(cwd)
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+def test_respect_ray(model):
+    """Check that requesting the "ray" executor backend is honoured.
+
+    Builds an LLMEngine with distributed_executor_backend="ray" and no
+    explicit parallel sizes, then asserts engine.model_executor.uses_ray.
+    """
+    # even for TP=1 and PP=1,
+    # if users specify ray, we should use ray.
+    # users might do this if they want to manage the
+    # resources using ray.
+    engine_args = EngineArgs(
+        model=model,
+        distributed_executor_backend="ray",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
+        enforce_eager=True,  # reduce test time
+    )
+    engine = LLMEngine.from_engine_args(engine_args)
+    assert engine.model_executor.uses_ray
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -2,18 +2,22 @@
 import pytest
+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
-import os
-from ..utils import models_path_prefix
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.
-    llm = LLM(model=model, skip_tokenizer_init=True)
+    llm = LLM(model=model,
+              skip_tokenizer_init=True,
+              load_format=LoadFormat.RUNAI_STREAMER)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
    with pytest.raises(ValueError, match="cannot pass text prompts when"):

--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -14,7 +14,7 @@ import transformers
 from vllm import SamplingParams
 from ..utils import models_path_prefix
-MODEL = os.path.join(models_path_prefix, "facebook/opt-350m")
+MODEL = os.path.join(models_path_prefix, "distilbert/distilgpt2")
 STOP_STR = "."
 SEED = 42
 MAX_TOKENS = 1024

--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -141,6 +141,47 @@ def sample_definition_json_schema():
    }
+@pytest.fixture
+def sample_enum_json_schema():
+    """Return a JSON schema whose values are all constrained by ``enum``.
+
+    The object has four required properties: ``status`` and ``priority``
+    (string enums), ``category`` (an object with a string-enum ``type`` and
+    an integer-enum ``severity``, both required) and ``flags`` (an array of
+    string-enum items).
+    """
+    return {
+        "type": "object",
+        "properties": {
+            "status": {
+                "type": "string",
+                "enum": ["active", "inactive",
+                         "pending"]  # Literal values using enum
+            },
+            "priority": {
+                "type": "string",
+                "enum": ["low", "medium", "high", "critical"]
+            },
+            "category": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "enum": ["bug", "feature", "improvement"]
+                    },
+                    "severity": {
+                        "type": "integer",
+                        "enum": [1, 2, 3, 4,
+                                 5]  # Enum can also contain numbers
+                    }
+                },
+                "required": ["type", "severity"]
+            },
+            "flags": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                    "enum": ["urgent", "blocked", "needs_review", "approved"]
+                }
+            }
+        },
+        "required": ["status", "priority", "category", "flags"]
+    }
 @pytest.fixture
 def sample_guided_choice():
    return [

--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -23,10 +23,13 @@ RTOL = 0.03
 EXPECTED_VALUE = 0.58
-def run_test():
+def run_test(more_args=None):
    """Run the end to end accuracy test."""
-    model_args = f"pretrained={MODEL_NAME},max_model_len=2048"
+    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
+    if more_args is not None:
+        model_args = "{},{}".format(model_args, more_args)
    results = lm_eval.simple_evaluate(
        model="vllm",
@@ -41,14 +44,21 @@ def run_test():
            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
-@pytest.mark.skipif(not current_platform.is_cuda(),
+@pytest.mark.skipif(not current_platform.is_cuda()
-                    reason="V1 is currently only supported on CUDA.")
+                    and not current_platform.is_tpu(),
+                    reason="V1 is currently only supported on CUDA and TPU")
 def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
-        run_test()
+        # run_test appends more_args (comma-separated) to its model_args
+        # string; None leaves model_args unchanged.
+        more_args = None
+        if current_platform.is_tpu():
+            # Limit compilation time for TPU V1
+            more_args = "max_num_seqs=64"
+        run_test(more_args)
 def test_lm_eval_accuracy_v0_engine(monkeypatch):

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -6,13 +6,18 @@ import os
 import pytest
 from vllm import LLM
+from vllm.config import LoadFormat
+from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS
 from ...utils import models_path_prefix
+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 def test_chat():
-    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)
    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -30,7 +35,8 @@ def test_chat():
 def test_multi_chat():
-    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)
    prompt1 = "Explain the concept of entropy."
    prompt2 = "Explain what among us is."
@@ -67,7 +73,8 @@ def test_multi_chat():
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
    llm = LLM(
-        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
+        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,

--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
        def echo_rank(self):
            return self.rank
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -7,10 +7,11 @@ import pytest
 import os
 from vllm import LLM, PoolingParams, PoolingRequestOutput
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
+MODEL_NAME = os.path.join(models_path_prefix, "e5-mistral-7b-instruct")
 PROMPTS = [
    "Hello, my name is",
@@ -34,6 +35,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -7,10 +7,11 @@ import os
 import pytest
 from vllm import LLM, RequestOutput, SamplingParams
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
+MODEL_NAME = os.path.join(models_path_prefix, "distilgpt2")
 PROMPTS = [
    "Hello, my name is",
@@ -32,6 +33,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -8,11 +8,12 @@ import os
 from huggingface_hub import snapshot_download
 from vllm import LLM
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
+MODEL_NAME = os.path.join(models_path_prefix, "zephyr-7b-beta")
 PROMPTS = [
    "Hello, my name is",
@@ -29,6 +30,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -8,6 +8,7 @@ import jsonschema
 import pytest
 import os
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-7B-Instruct")
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct")
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME, max_model_len=1024)
+    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
+              max_model_len=1024)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)
@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm,
                            schema=sample_definition_json_schema)
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_enum_json_completion(sample_enum_json_schema, llm,
+                                     guided_decoding_backend: str):
+    """Check that guided JSON generation honours a schema containing enums.
+
+    Parametrized over every entry of GUIDED_DECODING_BACKENDS. The same
+    bug-report prompt is submitted twice; each completion must parse as JSON,
+    validate against ``sample_enum_json_schema`` and use only the enum values
+    listed in the explicit checks below.
+    """
+    guided_decoding = GuidedDecodingParams(json=sample_enum_json_schema,
+                                           backend=guided_decoding_backend)
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=guided_decoding)
+    request = (
+        "Create a bug report JSON that fits this schema: "
+        f"{sample_enum_json_schema}. Make it for a high priority critical bug."
+    )
+    results = llm.generate(prompts=[request, request],
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+    assert results is not None
+    for result in results:
+        assert result is not None
+        assert isinstance(result, RequestOutput)
+        prompt = result.prompt
+        completion = result.outputs[0].text
+        assert completion is not None
+        print(f"Prompt: {prompt!r}, Generated text: {completion!r}")
+        report = json.loads(completion)
+        jsonschema.validate(instance=report, schema=sample_enum_json_schema)
+        # The schema fixture is defined elsewhere; these literal checks spell
+        # out the enum values the generated report is expected to use.
+        assert report["status"] in ("active", "inactive", "pending")
+        assert report["priority"] in ("low", "medium", "high", "critical")
+        category = report["category"]
+        assert category["type"] in ("bug", "feature", "improvement")
+        assert category["severity"] in (1, 2, 3, 4, 5)
+        for flag in report["flags"]:
+            assert flag in ("urgent", "blocked", "needs_review", "approved")
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 def test_guided_choice_completion(sample_guided_choice, llm,

--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -7,11 +7,12 @@ from contextlib import nullcontext
 from vllm_test_utils import BlameResult, blame
 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from ...utils import models_path_prefix
-def run_normal():
+def run_normal_opt125m():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
@@ -35,9 +36,35 @@ def run_normal():
    cleanup_dist_env_and_memory()
+def run_normal():
+    """Run a plain (unguided) generation pass as a baseline.
+
+    Builds an LLM for distilgpt2, generates completions for a few fixed
+    prompts, prints each prompt/completion pair, then tears the engine down.
+    """
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # Create an LLM without guided decoding as a baseline.
+    # Resolve the weights through models_path_prefix, the same way run_lmfe
+    # in this file does, instead of a hard-coded s3:// URI.
+    llm = LLM(model=os.path.join(models_path_prefix, "distilgpt2"),
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True,
+              gpu_memory_utilization=0.3)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Destroy the LLM object and free up the GPU memory.
+    del llm
+    cleanup_dist_env_and_memory()
 def run_lmfe(sample_regex):
    # Create an LLM with guided decoding enabled.
-    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
+    llm = LLM(model=os.path.join(models_path_prefix, "distilgpt2"),
+              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -5,6 +5,7 @@ import os
 from vllm import LLM
 from ...utils import models_path_prefix
+from vllm.config import LoadFormat
 @pytest.fixture(autouse=True)
@@ -16,13 +17,17 @@ def v1(run_with_both_engines):
 def test_empty_prompt():
-    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True)
+    """An empty prompt string must be rejected with a ValueError."""
+    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])
 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
-    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True)
+    """Expect ValueError('out of vocabulary') for prompt token id 999999."""
+    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate({"prompt_token_ids": [999999]})