[ci] Use env var to control whether to use S3 bucket in CI (#13634)

2c5e637b · Kevin H. Luu · GitHub · 322d2a27 · 2c5e637b · 2c5e637b
Unverified Commit 2c5e637b authored Feb 22, 2025 by Kevin H. Luu Committed by GitHub Feb 22, 2025
10 changed files
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -10,14 +10,12 @@ import pytest
 from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
 from vllm.engine.arg_utils import AsyncEngineArgs

-MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
+MODEL = "google/gemma-1.1-2b-it"
 NUM_EXPECTED_TOKENS = 10
 NUM_REQUESTS = 10000

 # Scenarios to test for num generated token.
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
-                              load_format="runai_streamer",
-                              disable_log_requests=True)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)


 @pytest.fixture(scope="function")

--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -553,8 +553,7 @@ def test_find_mm_placeholders(
    assert result == expected


-@pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
    ("limit", "num_supported", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
@@ -593,8 +592,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
        profiler.get_dummy_data(model_config.max_model_len)


-@pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
    ("num_images", "limit", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),

--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -16,7 +16,7 @@ from vllm.engine.llm_engine import LLMEngine
 from ..models.utils import check_outputs_equal

 MODELS = [
-    "facebook/opt-125m",
+    "distilbert/distilgpt2",
 ]

 UNSTABLE_PROMPT_SEQUENCE = [

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -8,20 +8,14 @@ from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform

-from .conftest import MODEL_WEIGHTS_S3_BUCKET
-

 @pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_task"),
    [
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
-         "generate"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
-         "pooling", "embed"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
-         "classify"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
-         "pooling", "score"),
+        ("distilbert/distilgpt2", "generate", "generate"),
+        ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
        ("openai/whisper-small", "transcription", "transcription"),
    ],

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -10,9 +10,6 @@ import gc
 import torch

 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
-
-from .conftest import MODEL_WEIGHTS_S3_BUCKET


 def test_duplicated_ignored_sequence_group():
@@ -21,8 +18,7 @@ def test_duplicated_ignored_sequence_group():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
@@ -35,8 +31,7 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["Just say hello!"]
@@ -46,9 +41,7 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
    del llm

    gc.collect()

--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker

 def test_swap() -> None:
    # Configure the engine.
-    engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2",
+    engine_args = EngineArgs(model="distilbert/distilgpt2",
                             dtype="half",
                             load_format="dummy")
    engine_config = engine_args.create_engine_config()

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -22,6 +22,7 @@ from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.plugins import load_general_plugins
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -1141,6 +1142,14 @@ class EngineArgs:
            f", but got {self.cpu_offload_gb}")

        device_config = DeviceConfig(device=self.device)
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
        model_config = self.create_model_config()

        if (model_config.is_multimodal_model and not envs.VLLM_USE_V1

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -618,6 +618,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # Port of the master node in the data parallel setting
    "VLLM_DP_MASTER_PORT":
    lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
+
+    # Whether to use S3 path for model loading in CI via RunAI Streamer
+    "VLLM_CI_USE_S3":
+    lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
 }

 # end-env-vars-definition

--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1394,7 +1394,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):

 def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
    """Get a model loader based on the load format."""
-
    if isinstance(load_config.load_format, type):
        return load_config.load_format(load_config)


--- a/vllm/test_utils.py
+++ b/vllm/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
+MODELS_ON_S3 = [
+    "adept/fuyu-8b",
+    "ai21labs/AI21-Jamba-1.5-Mini",
+    "ai21labs/Jamba-tiny-random",
+    "ai21labs/Jamba-tiny-reward-dev",
+    "allenai/Molmo-7B-D-0924",
+    "allenai/OLMo-1B-hf",
+    "allenai/OLMoE-1B-7B-0924-Instruct",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "ArthurZ/Ilama-3.2-1B",
+    "BAAI/bge-base-en-v1.5",
+    "BAAI/bge-multilingual-gemma2",
+    "BAAI/bge-reranker-v2-m3",
+    "bigcode/starcoder2-3b",
+    "cross-encoder/ms-marco-MiniLM-L-6-v2",
+    "cross-encoder/quora-roberta-base",
+    "deepseek-ai/deepseek-vl2-tiny",
+    "distilbert/distilgpt2",
+    "facebook/bart-base",
+    "facebook/bart-large-cnn",
+    # "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "google/gemma-1.1-2b-it",
+    "google/gemma-2-2b-it",
+    "google/paligemma-3b-pt-224",
+    "h2oai/h2ovl-mississippi-800m",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
+    "internlm/internlm2-1_8b-reward",
+    "intfloat/e5-mistral-7b-instruct",
+    "intfloat/multilingual-e5-large",
+    "jason9693/Qwen2.5-1.5B-apeach",
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    # "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Meta-Llama-3-8B",
+    "microsoft/phi-2",
+    "microsoft/Phi-3-mini-4k-instruct",
+    "microsoft/Phi-3-small-8k-instruct",
+    "microsoft/Phi-3-vision-128k-instruct",
+    "microsoft/Phi-3.5-MoE-instruct",
+    "microsoft/Phi-3.5-vision-instruct",
+    # "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "mistralai/Pixtral-12B-2409",
+    "mistral-community/Mixtral-8x22B-v0.1-AWQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nvidia/NVLM-D-72B",
+    "openai-community/gpt2",
+    # "openai/whisper-large-v3",
+    "openbmb/MiniCPM-o-2_6",
+    "openbmb/MiniCPM-V-2_6",
+    "OpenGVLab/InternVL2-1B",
+    "parasail-ai/GritLM-7B-vllm",
+    "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+    "Qwen/Qwen2-7B-Instruct",
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen2.5-Math-PRM-7B",
+    "Qwen/Qwen2.5-Math-RM-72B",
+    "Qwen/Qwen2.5-VL-3B-Instruct",
+    "royokong/e5-v",
+    "sentence-transformers/all-roberta-large-v1",
+    "sentence-transformers/stsb-roberta-base-v2",
+    "shanearora/OLMo-7B-1124-hf",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ssmits/Qwen2-7B-Instruct-embed-base",
+    "stabilityai/stablelm-3b-4e1t",
+    "stabilityai/stablelm-zephyr-3b",
+    "state-spaces/mamba-130m-hf",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "THUDM/glm-4v-9b",
+    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "TIGER-Lab/VLM2Vec-Full",
+    "tiiuae/falcon-40b",
+    "tiiuae/falcon-mamba-7b-instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "upstage/solar-pro-preview-instruct",
+]
+
+MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"