[2/n][ci] S3: Use full model path (#13564)

Signed-off-by: <>

[2/n][ci] S3: Use full model path (#13564)
Signed-off-by: <>
a64a8443 · Kevin H. Luu · GitHub · aa1e62d0 · a64a8443 · a64a8443
Unverified Commit a64a8443 authored Feb 20, 2025 by Kevin H. Luu Committed by GitHub Feb 20, 2025
8 changed files
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -121,7 +121,7 @@ def test_cumem_with_cudagraph():
    "model, use_v1",
    [
        # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", False),
    ])

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -746,8 +746,7 @@ class VllmRunner:
        **kwargs,
    ) -> None:
        if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"s3://vllm-ci-model-weights/"
-                          f"{model_name.split('/')[-1]}")
+            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
            load_format = LoadFormat.RUNAI_STREAMER
        if not load_format:
            load_format = LoadFormat.AUTO

--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion

--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -9,7 +9,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and
    # without optional detokenization, that detokenization includes text

--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -38,7 +38,8 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
@@ -51,7 +52,8 @@ def test_custom_executor_type_checking(model):
        AsyncLLMEngine.from_engine_args(engine_args)


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -75,7 +77,8 @@ def test_custom_executor(model, tmp_path):
        os.chdir(cwd)


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)
@@ -103,7 +106,8 @@ def test_custom_executor_async(model, tmp_path):
        os.chdir(cwd)


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_respect_ray(model):
    # even for TP=1 and PP=1,
    # if users specify ray, we should use ray.

--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -9,7 +9,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET


-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -14,13 +14,14 @@ from .conftest import MODEL_WEIGHTS_S3_BUCKET
 @pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_task"),
    [
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", "generate", "generate"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/e5-mistral-7b-instruct", "pooling",
-         "embed"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/Qwen2.5-1.5B-apeach", "pooling",
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
+         "generate"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
+         "pooling", "embed"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
         "classify"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/ms-marco-MiniLM-L-6-v2", "pooling",
-         "score"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
+         "pooling", "score"),
        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
        ("openai/whisper-small", "transcription", "transcription"),
    ],

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -21,7 +21,7 @@ def test_duplicated_ignored_sequence_group():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
@@ -35,7 +35,7 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
@@ -46,7 +46,7 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True)
    del llm