Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -7,30 +7,60 @@ import pytest ...@@ -7,30 +7,60 @@ import pytest
from vllm.distributed.utils import get_pp_indices from vllm.distributed.utils import get_pp_indices
def test_custom_layer_partition(): def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
def _verify(partition_str, num_layers, pp_size, goldens): with monkeypatch.context() as m:
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str def _verify(partition_str, num_layers, pp_size, goldens):
for pp_rank, golden in enumerate(goldens): bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
if bak is not None: for pp_rank, golden in enumerate(goldens):
os.environ["VLLM_PP_LAYER_PARTITION"] = bak assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
if bak is not None:
# Even partition m.setenv("VLLM_PP_LAYER_PARTITION", bak)
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Balanced partition # Even partition
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Put remainder somewhere # Balanced partition
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
# Invalid partition strings # Put remainder somewhere
with pytest.raises(ValueError): _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Invalid partition strings
with pytest.raises(ValueError): with pytest.raises(ValueError):
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
# Wrong number of partitions with pytest.raises(ValueError):
with pytest.raises(ValueError): _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Wrong number of partitions
# Wrong number of layers with pytest.raises(ValueError):
with pytest.raises(ValueError): _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) # Wrong number of layers
with pytest.raises(ValueError):
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@pytest.mark.parametrize(
    "num_hidden_layers,pp_size,pp_rank,indices",
    [
        # Two pipeline stages.
        (2, 2, 0, (0, 1)),
        (2, 2, 1, (1, 2)),
        (3, 2, 0, (0, 2)),
        (3, 2, 1, (2, 3)),
        # Three pipeline stages.
        (3, 3, 0, (0, 1)),
        (3, 3, 1, (1, 2)),
        (3, 3, 2, (2, 3)),
        (4, 3, 0, (0, 1)),
        (4, 3, 1, (1, 3)),
        (4, 3, 2, (3, 4)),
        (5, 3, 0, (0, 2)),
        (5, 3, 1, (2, 4)),
        (5, 3, 2, (4, 5)),
    ])
def test_uneven_auto_partition(
    num_hidden_layers: int,
    pp_size: int,
    pp_rank: int,
    indices: tuple[int, int],
):
    """Check the index pair `get_pp_indices` returns for one rank.

    Each parametrized case supplies a layer count, a pipeline-parallel
    size, a rank within that size, and the pair of indices the rank is
    expected to receive.
    """
    computed = get_pp_indices(num_hidden_layers, pp_rank, pp_size)
    assert computed == indices
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import os from typing import TYPE_CHECKING
import os
import pytest import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix from ..utils import compare_two_settings, create_new_process_for_each_test, models_path_prefix
if TYPE_CHECKING:
from typing_extensions import LiteralString
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
...@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models ...@@ -14,19 +19,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test, models
"FLASH_ATTN", "FLASH_ATTN",
# "FLASHINFER", # "FLASHINFER",
]) ])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): def test_pp_cudagraph(
cudagraph_args = [ monkeypatch: pytest.MonkeyPatch,
# use half precision for speed and memory savings in CI environment PP_SIZE: int,
"--dtype", MODEL_NAME: str,
"float16", ATTN_BACKEND: LiteralString,
"--pipeline-parallel-size", ):
str(PP_SIZE), with monkeypatch.context() as m:
"--distributed-executor-backend", cudagraph_args = [
"mp", # use half precision for speed and memory savings in CI environment
] "--dtype",
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND "float16",
"--pipeline-parallel-size",
eager_args = cudagraph_args + ["--enforce-eager"] str(PP_SIZE),
"--distributed-executor-backend",
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) "mp",
]
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
import multiprocessing import multiprocessing
import os import os
from typing import Dict, List
import pytest import pytest
import torch import torch
...@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables ...@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
def distributed_run(fn, world_size): def distributed_run(fn, world_size):
number_of_processes = world_size number_of_processes = world_size
processes: List[multiprocessing.Process] = [] processes: list[multiprocessing.Process] = []
for i in range(number_of_processes): for i in range(number_of_processes):
env: Dict[str, str] = {} env: dict[str, str] = {}
env['RANK'] = str(i) env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i) env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes) env['WORLD_SIZE'] = str(number_of_processes)
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
import multiprocessing import multiprocessing
import random import random
import time import time
from typing import List
import numpy as np import numpy as np
import torch.distributed as dist import torch.distributed as dist
...@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup ...@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import get_ip, get_open_port, update_environment_variables from vllm.utils import get_ip, get_open_port, update_environment_variables
def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
np.random.seed(seed) np.random.seed(seed)
sizes = np.random.randint(1, 10_000, n) sizes = np.random.randint(1, 10_000, n)
# on average, each array will have 5k elements # on average, each array will have 5k elements
......
...@@ -9,6 +9,8 @@ import torch.distributed as dist ...@@ -9,6 +9,8 @@ import torch.distributed as dist
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_world_group from vllm.distributed.parallel_state import get_world_group
dist.init_process_group(backend="gloo")
# Create prompts # Create prompts
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m", ...@@ -25,7 +27,8 @@ llm = LLM(model="facebook/opt-125m",
tensor_parallel_size=2, tensor_parallel_size=2,
distributed_executor_backend="external_launcher", distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9), gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4)) swap_space=random.randint(1, 4),
seed=0)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -48,6 +51,12 @@ test_consistent_across_ranks( ...@@ -48,6 +51,12 @@ test_consistent_across_ranks(
test_consistent_across_ranks( test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
# The process that created the `LLM` instance must be able to reach the
# model parameters directly; the parameter count is then handed to the
# same cross-rank check used for the other values above.
model_runner = llm.llm_engine.model_executor.driver_worker.worker.model_runner
params = list(model_runner.model.parameters())
test_consistent_across_ranks(len(params))
# all ranks should have the same outputs # all ranks should have the same outputs
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
""" """
from typing import List, Optional, Tuple from typing import Optional
import pytest import pytest
import os import os
...@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs ...@@ -17,7 +17,6 @@ from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import is_hip
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
LIST_ENC_DEC_SUPPORTED_BACKENDS = [ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
...@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [ ...@@ -25,8 +24,17 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin `VLLM_USE_V1` to `"0"` for every test in this module.

    The tests here are V0 only. Because the fixture is autouse, each test
    picks it up without requesting it by name.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
def vllm_to_hf_output( def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
decoder_prompt_type: DecoderPromptType, decoder_prompt_type: DecoderPromptType,
): ):
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force the V0 code path by exporting `VLLM_USE_V1=0`.

    Applied automatically (autouse) around each test function, since the
    tests covered by this fixture are V0 only.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
...@@ -2,16 +2,12 @@ ...@@ -2,16 +2,12 @@
import pytest import pytest
from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int): def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion # This test checks if we are able to run the engine to completion
...@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int): ...@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.") "decoration.")
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size, block_size=block_size,
enable_prefix_caching=True) enable_prefix_caching=True)
......
...@@ -2,11 +2,10 @@ ...@@ -2,11 +2,10 @@
import asyncio import asyncio
import os import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Optional, Union
import pytest import pytest
from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
...@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams ...@@ -15,10 +14,6 @@ from vllm.sampling_params import SamplingParams
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
class Mock: class Mock:
... ...
...@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor): ...@@ -29,8 +24,8 @@ class CustomUniExecutor(UniProcExecutor):
def collective_rpc(self, def collective_rpc(self,
method: Union[str, Callable], method: Union[str, Callable],
timeout: Optional[float] = None, timeout: Optional[float] = None,
args: Tuple = (), args: tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]: kwargs: Optional[dict] = None) -> list[Any]:
# Drop marker to show that this was run # Drop marker to show that this was run
with open(".marker", "w"): with open(".marker", "w"):
... ...
...@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor): ...@@ -39,12 +34,10 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync = CustomUniExecutor CustomUniExecutorAsync = CustomUniExecutor
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor_type_checking(model): def test_custom_executor_type_checking(model):
with pytest.raises(ValueError): with pytest.raises(ValueError):
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock) distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args) LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError): with pytest.raises(ValueError):
...@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model): ...@@ -53,8 +46,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args) AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor(model, tmp_path): def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path): ...@@ -63,7 +55,6 @@ def test_custom_executor(model, tmp_path):
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor, distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
) )
...@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path): ...@@ -78,8 +69,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor_async(model, tmp_path): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path): ...@@ -88,7 +78,6 @@ def test_custom_executor_async(model, tmp_path):
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync, distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
) )
...@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -107,8 +96,7 @@ def test_custom_executor_async(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_respect_ray(model): def test_respect_ray(model):
# even for TP=1 and PP=1, # even for TP=1 and PP=1,
# if users specify ray, we should use ray. # if users specify ray, we should use ray.
...@@ -117,7 +105,6 @@ def test_respect_ray(model): ...@@ -117,7 +105,6 @@ def test_respect_ray(model):
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, model=model,
distributed_executor_backend="ray", distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
......
...@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, ...@@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter from vllm.utils import Counter
from ...core.utils import create_seq_group from ..core.utils import create_seq_group
@pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("seq_output_len", [128])
......
...@@ -4,7 +4,7 @@ import asyncio ...@@ -4,7 +4,7 @@ import asyncio
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from functools import partial from functools import partial
from time import sleep from time import sleep
from typing import Any, List, Tuple from typing import Any
import pytest import pytest
...@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase ...@@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase
class DummyWorkerWrapper(WorkerWrapperBase): class DummyWorkerWrapper(WorkerWrapperBase):
"""Dummy version of vllm.worker.worker.Worker""" """Dummy version of vllm.worker.worker.Worker"""
def worker_method(self, worker_input: Any) -> Tuple[int, Any]: def worker_method(self, worker_input: Any) -> tuple[int, Any]:
sleep(0.05) sleep(0.05)
if isinstance(worker_input, Exception): if isinstance(worker_input, Exception):
...@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase): ...@@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase):
return self.rpc_rank, input return self.rpc_rank, input
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]:
result_handler = ResultHandler() result_handler = ResultHandler()
vllm_config = VllmConfig() vllm_config = VllmConfig()
workers = [ workers = [
......
...@@ -2,22 +2,19 @@ ...@@ -2,22 +2,19 @@
import pytest import pytest
from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_skip_tokenizer_initialization(model: str): def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.
llm = LLM(model=model, llm = LLM(
skip_tokenizer_init=True, model=model,
load_format=LoadFormat.RUNAI_STREAMER) skip_tokenizer_init=True,
)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError, match="cannot pass text prompts when"): with pytest.raises(ValueError, match="cannot pass text prompts when"):
......
...@@ -44,10 +44,14 @@ def run_test(more_args=None): ...@@ -44,10 +44,14 @@ def run_test(more_args=None):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
# TODO: [AlexM] Fix it with new CI/CD tests
TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU") reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -58,10 +62,14 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
# Limit compilation time for TPU V1 # Limit compilation time for TPU V1
more_args = "max_num_seqs=64" more_args = "max_num_seqs=64"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(more_args) run_test(more_args)
def test_lm_eval_accuracy_v0_engine(monkeypatch): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List
import os import os
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.config import LoadFormat
from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS from ..openai.test_vision import TEST_IMAGE_URLS
from ...utils import models_path_prefix from ...utils import models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
def test_chat(): def test_chat():
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
load_format=RUNAI_STREAMER_LOAD_FORMAT)
prompt1 = "Explain the concept of entropy." prompt1 = "Explain the concept of entropy."
messages = [ messages = [
...@@ -35,8 +28,7 @@ def test_chat(): ...@@ -35,8 +28,7 @@ def test_chat():
def test_multi_chat(): def test_multi_chat():
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
load_format=RUNAI_STREAMER_LOAD_FORMAT)
prompt1 = "Explain the concept of entropy." prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is." prompt2 = "Explain what among us is."
...@@ -71,11 +63,9 @@ def test_multi_chat(): ...@@ -71,11 +63,9 @@ def test_multi_chat():
@pytest.mark.parametrize("image_urls", @pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]): def test_chat_multi_image(image_urls: list[str]):
llm = LLM( llm = LLM(
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
dtype="bfloat16",
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
enforce_eager=True, enforce_eager=True,
......
...@@ -4,12 +4,12 @@ import pytest ...@@ -4,12 +4,12 @@ import pytest
from vllm import LLM from vllm import LLM
from ...utils import fork_new_process_for_each_test from ...utils import create_new_process_for_each_test
@pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"]) @pytest.mark.parametrize("backend", ["mp", "ray"])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend): def test_collective_rpc(tp_size, backend):
if tp_size == 1 and backend == "ray": if tp_size == 1 and backend == "ray":
pytest.skip("Skip duplicate test case") pytest.skip("Skip duplicate test case")
...@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend): ...@@ -21,18 +21,9 @@ def test_collective_rpc(tp_size, backend):
def echo_rank(self): def echo_rank(self):
return self.rank return self.rank
from vllm.worker.worker import Worker llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
class MyWorker(Worker):
def echo_rank(self):
return self.rank
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
enforce_eager=True, enforce_eager=True,
load_format="dummy", load_format="dummy",
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
distributed_executor_backend=backend, distributed_executor_backend=backend)
worker_cls=MyWorker) assert llm.collective_rpc(echo_rank) == list(range(tp_size))
for method in ["echo_rank", echo_rank]:
assert llm.collective_rpc(method) == list(range(tp_size))
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import weakref import weakref
from typing import List
import pytest import pytest
import os import os
from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "e5-mistral-7b-instruct") MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -35,11 +33,11 @@ def llm(): ...@@ -35,11 +33,11 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=32768, max_num_batched_tokens=32768,
tensor_parallel_size=1, tensor_parallel_size=1,
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True) enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api(): with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
...@@ -49,8 +47,8 @@ def llm(): ...@@ -49,8 +47,8 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[PoolingRequestOutput], def assert_outputs_equal(o1: list[PoolingRequestOutput],
o2: List[PoolingRequestOutput]): o2: list[PoolingRequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import weakref import weakref
from typing import List
import os import os
import pytest import pytest
from vllm import LLM, RequestOutput, SamplingParams from vllm import LLM, RequestOutput, SamplingParams
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "distilgpt2") MODEL_NAME = os.path.join(models_path_prefix, "distilbert/distilgpt2")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -33,7 +31,6 @@ def llm(): ...@@ -33,7 +31,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1, tensor_parallel_size=1,
gpu_memory_utilization=0.10, gpu_memory_utilization=0.10,
...@@ -47,7 +44,7 @@ def llm(): ...@@ -47,7 +44,7 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
......
...@@ -8,12 +8,11 @@ import os ...@@ -8,12 +8,11 @@ import os
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from vllm import LLM from vllm import LLM
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -30,7 +29,6 @@ def llm(): ...@@ -30,7 +29,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
tensor_parallel_size=1, tensor_parallel_size=1,
max_model_len=8192, max_model_len=8192,
enable_lora=True, enable_lora=True,
......
...@@ -7,8 +7,8 @@ import weakref ...@@ -7,8 +7,8 @@ import weakref
import jsonschema import jsonschema
import pytest import pytest
import os import os
from pydantic import BaseModel
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
...@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams ...@@ -17,16 +17,16 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct")
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] GUIDED_DECODING_BACKENDS = [
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0)
load_format=LoadFormat.RUNAI_STREAMER,
max_model_len=1024)
with llm.deprecate_legacy_api(): with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
...@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): ...@@ -283,6 +283,22 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
guided_options_request=dict(guided_regex=sample_regex)) guided_options_request=dict(guided_regex=sample_regex))
@pytest.mark.skip_global_cleanup
def test_disable_guided_decoding_fallback(sample_regex, llm):
    """Regex guided decoding with `xgrammar:no-fallback` must be rejected.

    Builds sampling parameters that request regex-guided decoding through
    the `xgrammar:no-fallback` backend and expects `llm.generate` to raise
    a `ValueError` whose message matches the xgrammar "does not support
    regex" text.
    """
    guided_params = GuidedDecodingParams(regex=sample_regex,
                                         backend="xgrammar:no-fallback")
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     guided_decoding=guided_params)
    expected_message = "xgrammar does not support regex guided decoding"

    with pytest.raises(ValueError, match=expected_message):
        llm.generate(prompts="This should fail",
                     sampling_params=sampling_params,
                     use_tqdm=True)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_json_object(llm, guided_decoding_backend: str): def test_guided_json_object(llm, guided_decoding_backend: str):
...@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str): ...@@ -312,3 +328,56 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
# Parse to verify it is valid JSON # Parse to verify it is valid JSON
parsed_json = json.loads(generated_text) parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict) assert isinstance(parsed_json, dict)
@pytest.mark.skip_global_cleanup
def test_json_with_any_whitespace_disabled(llm):
    """Check JSON-guided output with ``xgrammar:disable-any-whitespace``.

    Generates a completion constrained by the JSON schema of
    ``ResponseSchema`` and asserts that the text contains no newline
    characters, parses as a JSON object, and validates against the schema.
    """

    class ResponseSchema(BaseModel):
        clarifying_question: str
        cost_per_serving: str
        calories: str
        type_dish_ids: str
        type_meal_ids: str
        product_ids: list[str]
        exclude_product_ids: list[str]
        allergen_ids: list[str]
        total_cooking_time: str
        kitchen_ids: str
        holiday_ids: str

    # Note: Without this setting, the response is sometimes full of `\n`
    # for some models. This option prevents that.
    guided_decoding_backend = 'xgrammar:disable-any-whitespace'

    schema = ResponseSchema.model_json_schema()
    guided_params = GuidedDecodingParams(json=schema,
                                         backend=guided_decoding_backend)
    sampling_params = SamplingParams(max_tokens=2000,
                                     frequency_penalty=0,
                                     presence_penalty=-1.1,
                                     repetition_penalty=1.3,
                                     guided_decoding=guided_params)

    # Adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own trailing space; the original dropped the one after
    # "You", producing "Youare a helpful assistant".
    prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. "
              "You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
              "I want a quick launch fast with $10.<|im_end|>\n"
              "<|im_start|>assistant\n")
    outputs = llm.generate(prompts=prompt,
                           sampling_params=sampling_params,
                           use_tqdm=True)

    assert outputs is not None

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)

        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert "\n" not in generated_text

        # Parse to verify it is valid JSON
        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
        jsonschema.validate(instance=parsed_json, schema=schema)
...@@ -4,14 +4,22 @@ import sys ...@@ -4,14 +4,22 @@ import sys
import os import os
from contextlib import nullcontext from contextlib import nullcontext
import pytest
from vllm_test_utils import BlameResult, blame from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1`` to ``'0'`` around each test in this module.

    V1 only supports xgrammar so this is irrelevant.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
def run_normal_opt125m(): def run_normal_opt125m():
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -46,8 +54,7 @@ def run_normal(): ...@@ -46,8 +54,7 @@ def run_normal():
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM without guided decoding as a baseline. # Create an LLM without guided decoding as a baseline.
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2", llm = LLM(model="distilbert/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True, enforce_eager=True,
gpu_memory_utilization=0.3) gpu_memory_utilization=0.3)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -63,8 +70,7 @@ def run_normal(): ...@@ -63,8 +70,7 @@ def run_normal():
def run_lmfe(sample_regex): def run_lmfe(sample_regex):
# Create an LLM with guided decoding enabled. # Create an LLM with guided decoding enabled.
llm = LLM(model=os.path.join(models_path_prefix, "distilgpt2"), llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True, enforce_eager=True,
guided_decoding_backend="lm-format-enforcer", guided_decoding_backend="lm-format-enforcer",
gpu_memory_utilization=0.3) gpu_memory_utilization=0.3)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment