Unverified commit d5d214ac, authored by Kevin H. Luu, committed by GitHub
Browse files

[1/n][CI] Load models in CI from S3 instead of HF (#13205)



Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
parent fd84857f
......@@ -8,16 +8,21 @@ import ray
from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
......@@ -141,8 +146,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
assert metrics_tag_content == actual_model_name, (
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (
......@@ -170,7 +176,8 @@ async def test_async_engine_log_metrics_regression(
"""
engine_args = AsyncEngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats)
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
results = async_engine.generate(
......@@ -199,7 +206,8 @@ def test_engine_log_metrics_regression(
) -> None:
engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats)
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
engine.add_request(
......@@ -283,7 +291,8 @@ def test_metric_spec_decode_interval(
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
enforce_eager=True)
enforce_eager=True,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args)
......
......@@ -173,7 +173,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"),
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct",
extras={"2.5": "Qwen/Qwen2.5-7B-Instruct"}), # noqa: E501
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
is_available_online=False),
......
......@@ -7,6 +7,7 @@ from transformers import PretrainedConfig
from vllm import LLM
from ..conftest import MODELS_ON_S3
from .registry import HF_EXAMPLE_MODELS
......@@ -42,8 +43,11 @@ def test_can_initialize(model_arch):
with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
_initialize_kv_caches):
model_name = model_info.default
if model_name in MODELS_ON_S3:
model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
LLM(
model_info.default,
model_name,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model,
......
......@@ -10,8 +10,8 @@ import pytest
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from vllm.engine.arg_utils import AsyncEngineArgs
MODEL = "google/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
RAISED_ERROR = KeyError
RAISED_VALUE = "foo"
EXPECTED_TOKENS = 250
......
......@@ -21,8 +21,10 @@ from vllm.lora.request import LoRARequest
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
MODEL = "google/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
enforce_eager=True)
RAISED_ERROR = KeyError
RAISED_VALUE = "foo"
......
......@@ -10,12 +10,14 @@ import pytest
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from vllm.engine.arg_utils import AsyncEngineArgs
MODEL = "google/gemma-1.1-2b-it"
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
NUM_EXPECTED_TOKENS = 10
NUM_REQUESTS = 10000
# Scenarios to test for num generated token.
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
disable_log_requests=True)
@pytest.fixture(scope="function")
......
......@@ -553,7 +553,8 @@ def test_find_mm_placeholders(
assert result == expected
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("limit", "num_supported", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
......@@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
profiler.get_dummy_data(model_config.max_model_len)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
......@@ -661,7 +663,7 @@ class _ProcessorProxy:
return dict(exists=exists)
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"]) # Dummy
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
# yapf: disable
@pytest.mark.parametrize(
("call_kwargs", "expected_kwargs"),
......
......@@ -10,7 +10,7 @@ from vllm import SamplingParams
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -5,7 +5,7 @@ import torch
from vllm import SamplingParams
MODELS = ["facebook/opt-125m"]
MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -9,7 +9,7 @@ from vllm import SamplingParams
from ..conftest import VllmRunner
MODELS = ["facebook/opt-125m"]
MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -76,7 +76,7 @@ class TestOneTokenBadWord:
class TestTwoTokenBadWord:
# Another model (with a different tokenizer behaviour)
MODEL = "openai-community/gpt2"
MODEL = "distilbert/distilgpt2"
PROMPT = "How old are you? I am 10"
TARGET_TOKEN1 = "years"
......
......@@ -4,7 +4,7 @@ import pytest
from vllm import SamplingParams
MODELS = ["facebook/opt-125m"]
MODELS = ["distilbert/distilgpt2"]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -8,14 +8,19 @@ from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.layers.pooler import PoolingType
from vllm.platforms import current_platform
from .conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
[
("facebook/opt-125m", "generate", "generate"),
("intfloat/e5-mistral-7b-instruct", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", "generate", "generate"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/e5-mistral-7b-instruct", "pooling",
"embed"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/Qwen2.5-1.5B-apeach", "pooling",
"classify"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/ms-marco-MiniLM-L-6-v2", "pooling",
"score"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
("openai/whisper-small", "transcription", "transcription"),
],
......
......@@ -10,6 +10,9 @@ import gc
import torch
from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from .conftest import MODEL_WEIGHTS_S3_BUCKET
def test_duplicated_ignored_sequence_group():
......@@ -18,7 +21,8 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
llm = LLM(model="facebook/opt-125m",
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
......@@ -31,7 +35,8 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=None)
llm = LLM(model="facebook/opt-125m",
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["Just say hello!"]
......@@ -41,7 +46,9 @@ def test_max_tokens_none():
def test_gc():
llm = LLM("facebook/opt-125m", enforce_eager=True)
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
del llm
gc.collect()
......
......@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker
def test_swap() -> None:
# Configure the engine.
engine_args = EngineArgs(model="facebook/opt-125m",
engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2",
dtype="half",
load_format="dummy")
engine_config = engine_args.create_engine_config()
......
......@@ -409,7 +409,8 @@ class ModelConfig:
if is_s3(model) or is_s3(tokenizer):
if is_s3(model):
s3_model = S3Model()
s3_model.pull_files(model, allow_pattern=["*config.json"])
s3_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"])
self.model_weights = self.model
self.model = s3_model.dir
......
......@@ -1327,6 +1327,7 @@ class RunaiModelStreamerLoader(BaseModelLoader):
"""Prepare weights for the model.
If the model is not local, it will be downloaded."""
is_s3_path = is_s3(model_name_or_path)
is_local = os.path.isdir(model_name_or_path)
safetensors_pattern = "*.safetensors"
......@@ -1340,7 +1341,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
revision,
ignore_patterns=self.load_config.ignore_patterns,
))
if is_s3_path:
hf_weights_files = s3_glob(path=hf_folder,
allow_pattern=[safetensors_pattern])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment