[1/n][CI] Load models in CI from S3 instead of HF (#13205)

Signed-off-by: <> Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>

[1/n][CI] Load models in CI from S3 instead of HF (#13205)
Signed-off-by: <> Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
d5d214ac · Kevin H. Luu · GitHub · fd84857f · d5d214ac · d5d214ac
Unverified Commit d5d214ac authored Feb 18, 2025 by Kevin H. Luu Committed by GitHub Feb 19, 2025
20 changed files
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -37,3 +37,5 @@ genai_perf==0.0.8
 tritonclient==2.51.0

 numpy < 2.0.0
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0
\ No newline at end of file
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
    #   tokenizers
    #   transformers
    #   vocos
+humanize==4.11.0
+    # via runai-model-streamer
 idna==3.10
    # via
    #   anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
    #   patsy
    #   peft
    #   rouge-score
+    #   runai-model-streamer
    #   sacrebleu
    #   scikit-learn
    #   scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
    #   referencing
 rsa==4.7.2
    # via awscli
+runai-model-streamer==0.11.0
+    # via -r requirements-test.in
+runai-model-streamer-s3==0.11.0
+    # via -r requirements-test.in
 s3transfer==0.10.3
    # via
    #   awscli
@@ -594,6 +601,7 @@ torch==2.5.1
    #   encodec
    #   lm-eval
    #   peft
+    #   runai-model-streamer
    #   sentence-transformers
    #   tensorizer
    #   timm

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -9,6 +9,7 @@ import weakref
 import pytest

 from vllm import LLM
+from vllm.config import LoadFormat
 from vllm.platforms import current_platform

 from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):

 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("facebook/opt-125m")
+    llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
 @pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
+        ("distilbert/distilgpt2", "ray", "", "L4"),
+        ("distilbert/distilgpt2", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("distilbert/distilgpt2", "ray", "", "A100"),
+        ("distilbert/distilgpt2", "mp", "", "A100"),
+        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
 def test_models_distributed(
    hf_runner,

--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -4,9 +4,11 @@ import pytest
 import torch

 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test


@@ -118,13 +120,18 @@ def test_cumem_with_cudagraph():
 @pytest.mark.parametrize(
    "model",
    [
-        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
-        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
+        # sleep mode with safetensors
+        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        # sleep mode with pytorch checkpoint
+        "facebook/opt-125m"
    ])
 def test_end_to_end(model):
    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
+    load_format = LoadFormat.AUTO
+    if "Llama" in model:
+        load_format = LoadFormat.RUNAI_STREAMER
+    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    output = llm.generate(prompt, sampling_params)

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -17,7 +17,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
 from ..models.utils import check_outputs_equal

 MODELS = [
-    "facebook/opt-125m",
+    "distilbert/distilgpt2",
 ]



--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -46,6 +46,21 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

 _M = TypeVar("_M")
+
+MODELS_ON_S3 = [
+    "distilbert/distilgpt2",
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Meta-Llama-3-8B",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "openai-community/gpt2",
+    "ArthurZ/Ilama-3.2-1B",
+    "llava-hf/llava-1.5-7b-hf",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
+
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]

 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -677,8 +692,15 @@ class VllmRunner:
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
+        load_format: Optional[LoadFormat] = None,
        **kwargs,
    ) -> None:
+        if model_name in MODELS_ON_S3 and not load_format:
+            model_name = (f"s3://vllm-ci-model-weights/"
+                          f"{model_name.split('/')[-1]}")
+            load_format = LoadFormat.RUNAI_STREAMER
+        if not load_format:
+            load_format = LoadFormat.AUTO
        self.model = LLM(
            model=model_name,
            task=task,
@@ -693,6 +715,7 @@ class VllmRunner:
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
+            load_format=load_format,
            **kwargs,
        )


--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -2,12 +2,15 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
@@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
        "decoration.")

    engine_args = EngineArgs(model=model,
+                             load_format=LoadFormat.RUNAI_STREAMER,
                             block_size=block_size,
                             enable_prefix_caching=True)


--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -2,11 +2,14 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and
    # without optional detokenization, that detokenization includes text
@@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str):
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")

-    llm = LLM(model=model)
+    llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
    sampling_params = SamplingParams(max_tokens=10,
                                     temperature=0.0,
                                     detokenize=False)

--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -6,12 +6,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import pytest

+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+
+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
+

 class Mock:
    ...
@@ -33,10 +38,11 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
+                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                 distributed_executor_backend=Mock)
        LLMEngine.from_engine_args(engine_args)
    with pytest.raises(ValueError):
@@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model):
        AsyncLLMEngine.from_engine_args(engine_args)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path):

        engine_args = EngineArgs(
            model=model,
+            load_format=RUNAI_STREAMER_LOAD_FORMAT,
            distributed_executor_backend=CustomUniExecutor,
            enforce_eager=True,  # reduce test time
        )
@@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path):
        os.chdir(cwd)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path):

        engine_args = AsyncEngineArgs(
            model=model,
+            load_format=RUNAI_STREAMER_LOAD_FORMAT,
            distributed_executor_backend=CustomUniExecutorAsync,
            enforce_eager=True,  # reduce test time
        )
@@ -95,7 +103,7 @@ def test_custom_executor_async(model, tmp_path):
        os.chdir(cwd)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_respect_ray(model):
    # even for TP=1 and PP=1,
    # if users specify ray, we should use ray.
@@ -104,6 +112,7 @@ def test_respect_ray(model):
    engine_args = EngineArgs(
        model=model,
        distributed_executor_backend="ray",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
        enforce_eager=True,  # reduce test time
    )
    engine = LLMEngine.from_engine_args(engine_args)

--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -2,16 +2,21 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.
-    llm = LLM(model=model, skip_tokenizer_init=True)
+    llm = LLM(model=model,
+              skip_tokenizer_init=True,
+              load_format=LoadFormat.RUNAI_STREAMER)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

    with pytest.raises(ValueError, match="cannot pass text prompts when"):

--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -12,7 +12,7 @@ import transformers

 from vllm import SamplingParams

-MODEL = "facebook/opt-350m"
+MODEL = "distilbert/distilgpt2"
 STOP_STR = "."
 SEED = 42
 MAX_TOKENS = 1024

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -5,12 +5,17 @@ from typing import List
 import pytest

 from vllm import LLM
+from vllm.config import LoadFormat

+from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS

+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
+

 def test_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)

    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -28,7 +33,8 @@ def test_chat():


 def test_multi_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)

    prompt1 = "Explain the concept of entropy."
    prompt2 = "Explain what among us is."
@@ -65,7 +71,8 @@ def test_multi_chat():
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
    llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
+        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,

--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
        def echo_rank(self):
            return self.rank

-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -6,9 +6,10 @@ from typing import List
 import pytest

 from vllm import LLM, PoolingParams, PoolingRequestOutput
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory

-MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"

 PROMPTS = [
    "Hello, my name is",
@@ -32,6 +33,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -6,9 +6,10 @@ from typing import List
 import pytest

 from vllm import LLM, RequestOutput, SamplingParams
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory

-MODEL_NAME = "facebook/opt-125m"
+MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"

 PROMPTS = [
    "Hello, my name is",
@@ -30,6 +31,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -7,10 +7,11 @@ import pytest
 from huggingface_hub import snapshot_download

 from vllm import LLM
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest

-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"

 PROMPTS = [
    "Hello, my name is",
@@ -27,6 +28,7 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -7,12 +7,13 @@ import weakref
 import jsonschema
 import pytest

+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

-MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
+MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]


@@ -20,7 +21,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME, max_model_len=1024)
+    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
+              max_model_len=1024)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
@@ -6,10 +6,11 @@ from contextlib import nullcontext
 from vllm_test_utils import BlameResult, blame

 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory


-def run_normal():
+def run_normal_opt125m():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
@@ -33,9 +34,35 @@ def run_normal():
    cleanup_dist_env_and_memory()


+def run_normal():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # Create an LLM without guided decoding as a baseline.
+    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True,
+              gpu_memory_utilization=0.3)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Destroy the LLM object and free up the GPU memory.
+    del llm
+    cleanup_dist_env_and_memory()
+
+
 def run_lmfe(sample_regex):
    # Create an LLM with guided decoding enabled.
-    llm = LLM(model="facebook/opt-125m",
+    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -3,6 +3,7 @@
 import pytest

 from vllm import LLM
+from vllm.config import LoadFormat


 @pytest.fixture(autouse=True)
@@ -14,13 +15,17 @@ def v1(run_with_both_engines):


 def test_empty_prompt():
-    llm = LLM(model="gpt2", enforce_eager=True)
+    llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])


 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
-    llm = LLM(model="gpt2", enforce_eager=True)
+    llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate({"prompt_token_ids": [999999]})
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py