Unverified commit d5d214ac, authored by Kevin H. Luu, committed by GitHub
Browse files

[1/n][CI] Load models in CI from S3 instead of HF (#13205)



Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
parent fd84857f
...@@ -8,16 +8,21 @@ import ray ...@@ -8,16 +8,21 @@ import ray
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine from vllm import EngineArgs, LLMEngine
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
MODELS = [ MODELS = [
"facebook/opt-125m", "distilbert/distilgpt2",
] ]
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
...@@ -141,8 +146,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, ...@@ -141,8 +146,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content = stat_logger.labels["model_name"] metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []: if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, ( actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
f"Metrics tag model_name is wrong! expect: {model!r}\n" assert metrics_tag_content == actual_model_name, (
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
f"actual: {metrics_tag_content!r}") f"actual: {metrics_tag_content!r}")
else: else:
assert metrics_tag_content == served_model_name[0], ( assert metrics_tag_content == served_model_name[0], (
...@@ -170,7 +176,8 @@ async def test_async_engine_log_metrics_regression( ...@@ -170,7 +176,8 @@ async def test_async_engine_log_metrics_regression(
""" """
engine_args = AsyncEngineArgs(model=model, engine_args = AsyncEngineArgs(model=model,
dtype=dtype, dtype=dtype,
disable_log_stats=disable_log_stats) disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
async_engine = AsyncLLMEngine.from_engine_args(engine_args) async_engine = AsyncLLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts): for i, prompt in enumerate(example_prompts):
results = async_engine.generate( results = async_engine.generate(
...@@ -199,7 +206,8 @@ def test_engine_log_metrics_regression( ...@@ -199,7 +206,8 @@ def test_engine_log_metrics_regression(
) -> None: ) -> None:
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
dtype=dtype, dtype=dtype,
disable_log_stats=disable_log_stats) disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts): for i, prompt in enumerate(example_prompts):
engine.add_request( engine.add_request(
...@@ -283,7 +291,8 @@ def test_metric_spec_decode_interval( ...@@ -283,7 +291,8 @@ def test_metric_spec_decode_interval(
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
speculative_model=model, speculative_model=model,
num_speculative_tokens=k, num_speculative_tokens=k,
enforce_eager=True) enforce_eager=True,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
......
...@@ -173,7 +173,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -173,7 +173,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True), trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct",
extras={"2.5": "Qwen/Qwen2.5-7B-Instruct"}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
is_available_online=False), is_available_online=False),
......
...@@ -7,6 +7,7 @@ from transformers import PretrainedConfig ...@@ -7,6 +7,7 @@ from transformers import PretrainedConfig
from vllm import LLM from vllm import LLM
from ..conftest import MODELS_ON_S3
from .registry import HF_EXAMPLE_MODELS from .registry import HF_EXAMPLE_MODELS
...@@ -42,8 +43,11 @@ def test_can_initialize(model_arch): ...@@ -42,8 +43,11 @@ def test_can_initialize(model_arch):
with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
_initialize_kv_caches): _initialize_kv_caches):
model_name = model_info.default
if model_name in MODELS_ON_S3:
model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
LLM( LLM(
model_info.default, model_name,
tokenizer=model_info.tokenizer, tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model, speculative_model=model_info.speculative_model,
......
...@@ -10,8 +10,8 @@ import pytest ...@@ -10,8 +10,8 @@ import pytest
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
MODEL = "google/gemma-1.1-2b-it" MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL) ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
RAISED_ERROR = KeyError RAISED_ERROR = KeyError
RAISED_VALUE = "foo" RAISED_VALUE = "foo"
EXPECTED_TOKENS = 250 EXPECTED_TOKENS = 250
......
...@@ -21,8 +21,10 @@ from vllm.lora.request import LoRARequest ...@@ -21,8 +21,10 @@ from vllm.lora.request import LoRARequest
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
MODEL = "google/gemma-1.1-2b-it" MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True) ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
enforce_eager=True)
RAISED_ERROR = KeyError RAISED_ERROR = KeyError
RAISED_VALUE = "foo" RAISED_VALUE = "foo"
......
...@@ -10,12 +10,14 @@ import pytest ...@@ -10,12 +10,14 @@ import pytest
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
MODEL = "google/gemma-1.1-2b-it" MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
NUM_EXPECTED_TOKENS = 10 NUM_EXPECTED_TOKENS = 10
NUM_REQUESTS = 10000 NUM_REQUESTS = 10000
# Scenarios to test for num generated token. # Scenarios to test for num generated token.
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
disable_log_requests=True)
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
......
...@@ -553,7 +553,8 @@ def test_find_mm_placeholders( ...@@ -553,7 +553,8 @@ def test_find_mm_placeholders(
assert result == expected assert result == expected
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("limit", "num_supported", "is_valid"), ("limit", "num_supported", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
...@@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ...@@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
profiler.get_dummy_data(model_config.max_model_len) profiler.get_dummy_data(model_config.max_model_len)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("num_images", "limit", "is_valid"), ("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
...@@ -661,7 +663,7 @@ class _ProcessorProxy: ...@@ -661,7 +663,7 @@ class _ProcessorProxy:
return dict(exists=exists) return dict(exists=exists)
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"]) # Dummy @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
# yapf: disable # yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("call_kwargs", "expected_kwargs"), ("call_kwargs", "expected_kwargs"),
......
...@@ -10,7 +10,7 @@ from vllm import SamplingParams ...@@ -10,7 +10,7 @@ from vllm import SamplingParams
# We also test with llama because it has generation_config to specify EOS # We also test with llama because it has generation_config to specify EOS
# (past regression). # (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"] MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -5,7 +5,7 @@ import torch ...@@ -5,7 +5,7 @@ import torch
from vllm import SamplingParams from vllm import SamplingParams
MODELS = ["facebook/opt-125m"] MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -9,7 +9,7 @@ from vllm import SamplingParams ...@@ -9,7 +9,7 @@ from vllm import SamplingParams
from ..conftest import VllmRunner from ..conftest import VllmRunner
MODELS = ["facebook/opt-125m"] MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -76,7 +76,7 @@ class TestOneTokenBadWord: ...@@ -76,7 +76,7 @@ class TestOneTokenBadWord:
class TestTwoTokenBadWord: class TestTwoTokenBadWord:
# Another model (with a different tokenizer behaviour) # Another model (with a different tokenizer behaviour)
MODEL = "openai-community/gpt2" MODEL = "distilbert/distilgpt2"
PROMPT = "How old are you? I am 10" PROMPT = "How old are you? I am 10"
TARGET_TOKEN1 = "years" TARGET_TOKEN1 = "years"
......
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from vllm import SamplingParams from vllm import SamplingParams
MODELS = ["facebook/opt-125m"] MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -8,14 +8,19 @@ from vllm.config import ModelConfig, PoolerConfig ...@@ -8,14 +8,19 @@ from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.pooler import PoolingType
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"), ("model_id", "expected_runner_type", "expected_task"),
[ [
("facebook/opt-125m", "generate", "generate"), (f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", "generate", "generate"),
("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), (f"{MODEL_WEIGHTS_S3_BUCKET}/e5-mistral-7b-instruct", "pooling",
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), "embed"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), (f"{MODEL_WEIGHTS_S3_BUCKET}/Qwen2.5-1.5B-apeach", "pooling",
"classify"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/ms-marco-MiniLM-L-6-v2", "pooling",
"score"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
("openai/whisper-small", "transcription", "transcription"), ("openai/whisper-small", "transcription", "transcription"),
], ],
......
...@@ -10,6 +10,9 @@ import gc ...@@ -10,6 +10,9 @@ import gc
import torch import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from .conftest import MODEL_WEIGHTS_S3_BUCKET
def test_duplicated_ignored_sequence_group(): def test_duplicated_ignored_sequence_group():
...@@ -18,7 +21,8 @@ def test_duplicated_ignored_sequence_group(): ...@@ -18,7 +21,8 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256)
llm = LLM(model="facebook/opt-125m", llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000] prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
...@@ -31,7 +35,8 @@ def test_max_tokens_none(): ...@@ -31,7 +35,8 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=None) max_tokens=None)
llm = LLM(model="facebook/opt-125m", llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["Just say hello!"] prompts = ["Just say hello!"]
...@@ -41,7 +46,9 @@ def test_max_tokens_none(): ...@@ -41,7 +46,9 @@ def test_max_tokens_none():
def test_gc(): def test_gc():
llm = LLM("facebook/opt-125m", enforce_eager=True) llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
del llm del llm
gc.collect() gc.collect()
......
...@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker ...@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker
def test_swap() -> None: def test_swap() -> None:
# Configure the engine. # Configure the engine.
engine_args = EngineArgs(model="facebook/opt-125m", engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2",
dtype="half", dtype="half",
load_format="dummy") load_format="dummy")
engine_config = engine_args.create_engine_config() engine_config = engine_args.create_engine_config()
......
...@@ -409,7 +409,8 @@ class ModelConfig: ...@@ -409,7 +409,8 @@ class ModelConfig:
if is_s3(model) or is_s3(tokenizer): if is_s3(model) or is_s3(tokenizer):
if is_s3(model): if is_s3(model):
s3_model = S3Model() s3_model = S3Model()
s3_model.pull_files(model, allow_pattern=["*config.json"]) s3_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"])
self.model_weights = self.model self.model_weights = self.model
self.model = s3_model.dir self.model = s3_model.dir
......
...@@ -1327,6 +1327,7 @@ class RunaiModelStreamerLoader(BaseModelLoader): ...@@ -1327,6 +1327,7 @@ class RunaiModelStreamerLoader(BaseModelLoader):
"""Prepare weights for the model. """Prepare weights for the model.
If the model is not local, it will be downloaded.""" If the model is not local, it will be downloaded."""
is_s3_path = is_s3(model_name_or_path) is_s3_path = is_s3(model_name_or_path)
is_local = os.path.isdir(model_name_or_path) is_local = os.path.isdir(model_name_or_path)
safetensors_pattern = "*.safetensors" safetensors_pattern = "*.safetensors"
...@@ -1340,7 +1341,6 @@ class RunaiModelStreamerLoader(BaseModelLoader): ...@@ -1340,7 +1341,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
revision, revision,
ignore_patterns=self.load_config.ignore_patterns, ignore_patterns=self.load_config.ignore_patterns,
)) ))
if is_s3_path: if is_s3_path:
hf_weights_files = s3_glob(path=hf_folder, hf_weights_files = s3_glob(path=hf_folder,
allow_pattern=[safetensors_pattern]) allow_pattern=[safetensors_pattern])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment