Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
...@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs, ...@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
...@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] ...@@ -49,6 +49,71 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M") _M = TypeVar("_M")
# Model identifiers that are looked up by membership
# (``model_name in MODELS_ON_S3``). Each identifier is listed exactly once;
# the original list repeated two entries, which had no effect on membership
# checks but made the list harder to audit.
MODELS_ON_S3 = [
    "distilbert/distilgpt2",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "openai-community/gpt2",
    "ArthurZ/Ilama-3.2-1B",
    "llava-hf/llava-1.5-7b-hf",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "ai21labs/Jamba-tiny-random",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]
MODEL_WEIGHTS_S3_BUCKET = models_path_prefix
_PromptMultiModalInput = Union[List[_M], List[List[_M]]] _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image] PromptImageInput = _PromptMultiModalInput[Image.Image]
...@@ -680,8 +745,14 @@ class VllmRunner: ...@@ -680,8 +745,14 @@ class VllmRunner:
enable_chunked_prefill: bool = False, enable_chunked_prefill: bool = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM( self.model = LLM(
model=model_name, model=model_name,
task=task, task=task,
...@@ -696,6 +767,7 @@ class VllmRunner: ...@@ -696,6 +767,7 @@ class VllmRunner:
max_model_len=max_model_len, max_model_len=max_model_len,
block_size=block_size, block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs, **kwargs,
) )
......
...@@ -7,6 +7,9 @@ import pytest # noqa ...@@ -7,6 +7,9 @@ import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, SequenceGroup from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
...@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output): ...@@ -16,7 +19,7 @@ def get_sequence_groups(scheduler_output):
return [s.seq_group for s in scheduler_output.scheduled_seq_groups] return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
def append_new_token(seq_group, token_id: int): def append_new_token(seq_group: SequenceGroup, token_id: int):
for seq in seq_group.get_seqs(): for seq in seq_group.get_seqs():
seq.append_token_id(token_id, {token_id: Logprob(token_id)}) seq.append_token_id(token_id, {token_id: Logprob(token_id)})
...@@ -123,6 +126,232 @@ def test_chunk(): ...@@ -123,6 +126,232 @@ def test_chunk():
assert out.num_batched_tokens == 57 assert out.num_batched_tokens == 57
def test_concurrent_chunking():
    """Check that two prefills are chunked side by side when
    --max-num-partial-prefills is greater than 1."""
    block_size = 4
    token_budget = 64
    seq_limit = 60
    model_len_limit = 2000
    scheduler_config = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most 2 partial prefills at once
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_gpu_blocks = 32
    cache_config.num_cpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
    running: List[SequenceGroup] = []

    # Queue two 60-token prompts.
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        running.append(group)

    # Round 1: each request receives half of the 64-token budget (32).
    # Round 2: each request has 60 - 32 = 28 tokens left to prefill.
    for expected_chunk, expected_total in ((32, 64), (28, 56)):
        metas, out = schedule_and_update_computed_tokens(scheduler)
        assert set(get_sequence_groups(out)) == set(running)
        assert metas[0].token_chunk_size == expected_chunk
        assert metas[1].token_chunk_size == expected_chunk
        assert out.num_prefill_groups == 2
        assert out.num_batched_tokens == expected_total
def test_concurrent_chunking_large_requests():
    """Check that large prefill requests are processed one at a time."""
    block_size = 4
    token_budget = 64
    seq_limit = 60
    model_len_limit = 2000
    scheduler_config = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most 2 partial prefills at once
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # Large KV cache so the 1200-token prompts fit.
    cache_config.num_gpu_blocks = 3200
    cache_config.num_cpu_blocks = 3200
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Queue two very large prompts.
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=1200,
                                       block_size=block_size)
        scheduler.add_seq_group(group)

    # Only one request is chunked, and it takes the full 64-token budget.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    scheduled = get_sequence_groups(out)
    assert len(scheduled) == 1
    assert metas[0].token_chunk_size == 64
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == 64
def test_short_prompts_jump_long_prompts_in_queue():
    """Check that a queued large prefill is passed over in favour of smaller
    prefills while another large prefill request is already running."""
    block_size = 4
    token_budget = 64
    seq_limit = 60
    model_len_limit = 2000
    scheduler_config = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,  # at most 2 partial prefills at once
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # Large KV cache so the 1200-token prompts fit.
    cache_config.num_gpu_blocks = 3200
    cache_config.num_cpu_blocks = 3200
    scheduler = Scheduler(scheduler_config, cache_config, None)

    long_seqs: List[SequenceGroup] = []
    short_seqs: List[SequenceGroup] = []

    def enqueue(request_id, prompt_length, bucket):
        # Create a prompt, hand it to the scheduler and remember it.
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=prompt_length,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        bucket.append(group)
        assert group.is_prefill()

    def tracked():
        # Fixed inspection order: long 0, long 1, short 0, short 1.
        return (long_seqs[0], long_seqs[1], short_seqs[0], short_seqs[1])

    def expect_chunks(metas, *sizes):
        for position, size in enumerate(sizes):
            assert metas[position].token_chunk_size == size

    def expect_prefilling(*flags):
        for group, flag in zip(tracked(), flags):
            assert bool(group.is_prefill()) is flag

    def expect_computed(*counts):
        for group, count in zip(tracked(), counts):
            assert group.first_seq.get_num_computed_tokens() == count

    # Two very large prompts first, then two small prompts behind them.
    for request_id in ("0", "1"):
        enqueue(request_id, 1200, long_seqs)
    for request_id in ("2", "3"):
        enqueue(request_id, 40, short_seqs)

    # Iteration 1: one large and one small request are chunked, 32 tokens each.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    expect_chunks(metas, 32, 32)
    # All four requests are still prefilling.
    expect_prefilling(True, True, True, True)
    # Only the first long and the first short request have been scheduled.
    expect_computed(32, 0, 32, 0)
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 64

    # Iteration 2: the first small request has only 40 - 32 = 8 tokens left
    # and moves to decode; the second small request is scheduled and receives
    # 64 - (32 + 8) = 24 tokens; the large request still receives 32.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    expect_chunks(metas, 24, 32, 8)
    expect_prefilling(True, True, False, True)
    # Both small requests started ahead of the second long request.
    expect_computed(64, 0, 40, 24)
    assert out.num_prefill_groups == 3
    assert out.num_batched_tokens == 64

    # The first small request gets a new token appended.
    append_new_token(short_seqs[0], 1)

    # Iteration 3: the first small request decodes, the second small request
    # prefills its remaining 40 - 24 = 16 tokens, the large one gets 32.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    expect_chunks(metas, 32, 16, 1)
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 49  # 32 + 16 + 1 decode
    # Both small requests have now reached decode.
    expect_prefilling(True, True, False, False)
    expect_computed(96, 0, 41, 40)

    # Both small requests get a new token appended.
    append_new_token(short_seqs[0], 1)
    append_new_token(short_seqs[1], 1)

    # Iteration 4: both small requests decode (1 token each), so the large
    # request receives the remaining 64 - 2 = 62 tokens.
    metas, out = schedule_and_update_computed_tokens(scheduler)
    expect_chunks(metas, 62, 1, 1)
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == 64
    expect_computed(158)

    # NOTE: a fifth iteration (large request taking the remaining budget
    # while both small requests keep decoding) is not asserted here.
def test_complex(): def test_complex():
block_size = 4 block_size = 4
max_seqs = 60 max_seqs = 60
...@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs(): ...@@ -508,7 +737,7 @@ def test_chunked_prefill_max_seqs():
assert not running[1].is_prefill() assert not running[1].is_prefill()
def test_perfix_caching(): def test_prefix_caching():
"""Verify allocating full blocks when prefix caching is enabled.""" """Verify allocating full blocks when prefix caching is enabled."""
block_size = 4 block_size = 4
max_seqs = 10 max_seqs = 10
...@@ -548,3 +777,86 @@ def test_perfix_caching(): ...@@ -548,3 +777,86 @@ def test_perfix_caching():
assert seq_group_meta[1].token_chunk_size == 12 assert seq_group_meta[1].token_chunk_size == 12
assert out.num_prefill_groups == 2 assert out.num_prefill_groups == 2
assert out.num_batched_tokens == 62 assert out.num_batched_tokens == 62
def test_prefix_caching_with_concurrent_partial_prefills():
    """Check that full blocks are allocated when prefix caching is enabled
    together with --max-num-partial-prefills > 1."""
    block_size = 4
    seq_limit = 10
    model_len_limit = 8000
    token_budget = 60  # two slots, so each slot gets 30 tokens
    scheduler_config = SchedulerConfig(
        "generate",
        token_budget,
        seq_limit,
        model_len_limit,
        enable_chunked_prefill=True,
        max_num_partial_prefills=2,
    )
    cache_config = CacheConfig(block_size,
                               1.0,
                               1,
                               "auto",
                               enable_prefix_caching=True)
    cache_config.num_gpu_blocks = 32
    cache_config.num_cpu_blocks = 0
    scheduler = Scheduler(scheduler_config, cache_config, None)
    running: List[SequenceGroup] = []

    # Queue two 50-token prompts.
    for request_id in ("0", "1"):
        _, group = create_dummy_prompt(request_id,
                                       block_size=block_size,
                                       prompt_length=50)
        scheduler.add_seq_group(group)
        running.append(group)

    # Round 1: each sequence could take up to 30 tokens, but the chunk is
    # rounded down to a multiple of the block size (4), giving 28.
    # Round 2: both sequences finish prefill with 50 - 28 = 22 tokens. That is
    # not a multiple of the block size, which is fine because the final
    # partial block of a prefix sequence is not cached.
    for expected_chunk, expected_total in ((28, 56), (22, 44)):
        metas, out = schedule_and_update_computed_tokens(scheduler)
        assert set(get_sequence_groups(out)) == set(running)
        assert metas[0].token_chunk_size == expected_chunk
        assert metas[1].token_chunk_size == expected_chunk
        assert out.num_prefill_groups == 2
        assert out.num_batched_tokens == expected_total
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str,
                                            max_num_partial_prefills: int):
    """Smoke-test that a real engine can run a step with several
    concurrent partial prefills.
    """
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model=model,
            max_num_partial_prefills=max_num_partial_prefills,
            max_num_batched_tokens=40,
            max_num_seqs=8,
            enable_chunked_prefill=True,
            gpu_memory_utilization=0.8,
        ))
    sampling_params = SamplingParams(temperature=0)
    prompt = "hello" * 40

    # Submit one request per allowed partial-prefill slot.
    for request_index in range(max_num_partial_prefills):
        engine.add_request(f"{request_index}", prompt, sampling_params)

    # After the first step no request has produced output yet, i.e. all of
    # them are still prefilling, and all of them are in the running queue.
    first_step_outputs = engine.step()
    assert len(first_step_outputs) == 0
    assert len(engine.scheduler[0].running) == max_num_partial_prefills
...@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
......
...@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes): ...@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
...@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
......
...@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node ...@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test all workers in a node other than the head node, which can cause the test
to fail. to fail.
""" """
import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional from typing import List, Literal, NamedTuple, Optional
...@@ -15,6 +16,7 @@ import pytest ...@@ -15,6 +16,7 @@ import pytest
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
logger = init_logger("test_pipeline_parallel") logger = init_logger("test_pipeline_parallel")
...@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple): ...@@ -31,19 +33,29 @@ class ParallelSetup(NamedTuple):
class PPTestOptions(NamedTuple): class PPTestOptions(NamedTuple):
multi_node_only: bool multi_node_only: bool
trust_remote_code: bool
tokenizer_mode: Optional[str]
load_format: Optional[str] = None load_format: Optional[str] = None
hf_overrides: Optional[str] = None
@dataclass @dataclass
class PPTestSettings: class PPTestSettings:
parallel_setups: List[ParallelSetup] parallel_setups: List[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: List[str] distributed_backends: List[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: List[str]
task: TaskOption task: TaskOption
test_options: PPTestOptions test_options: PPTestOptions
def __post_init__(self):
    """Reject settings whose backend and major-version lists differ in
    length.

    Raises:
        ValueError: if ``distributed_backends`` and ``vllm_major_versions``
            do not contain the same number of entries.
    """
    num_backends = len(self.distributed_backends)
    num_versions = len(self.vllm_major_versions)
    if num_backends == num_versions:
        return
    raise ValueError(f"Length mismatch: distributed_backends "
                     f"({num_backends}) != "
                     f"vllm_major_versions ({num_versions})")
@staticmethod @staticmethod
def detailed( def detailed(
*, *,
...@@ -51,10 +63,7 @@ class PPTestSettings: ...@@ -51,10 +63,7 @@ class PPTestSettings:
pp_base: int = 2, pp_base: int = 2,
multi_node_only: bool = False, multi_node_only: bool = False,
task: TaskOption = "auto", task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
): ):
return PPTestSettings( return PPTestSettings(
parallel_setups=[ parallel_setups=[
...@@ -79,13 +88,12 @@ class PPTestSettings: ...@@ -79,13 +88,12 @@ class PPTestSettings:
eager_mode=True, eager_mode=True,
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp", "ray"], # only ray is supported for V1
distributed_backends=["mp", "ray", "ray"],
vllm_major_versions=["0", "0", "1"],
task=task, task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code, load_format=load_format),
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
) )
@staticmethod @staticmethod
...@@ -95,10 +103,7 @@ class PPTestSettings: ...@@ -95,10 +103,7 @@ class PPTestSettings:
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto", task: TaskOption = "auto",
multi_node_only: bool = False, multi_node_only: bool = False,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
): ):
return PPTestSettings( return PPTestSettings(
parallel_setups=[ parallel_setups=[
...@@ -108,20 +113,19 @@ class PPTestSettings: ...@@ -108,20 +113,19 @@ class PPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp"], distributed_backends=["mp"],
vllm_major_versions=["0"],
task=task, task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code, load_format=load_format),
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
) )
def iter_params(self, model_name: str): def iter_params(self, model_id: str):
opts = self.test_options opts = self.test_options
for parallel_setup in self.parallel_setups: for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends: for backend, vllm_major_version in zip(self.distributed_backends,
yield (model_name, parallel_setup, distributed_backend, self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts) self.task, opts)
...@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = { ...@@ -133,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only] # [Decoder-only]
# Uses Llama # Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(), # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"): PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"): PPTestSettings.fast(load_format="dummy"), # noqa: E501
os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-13B-Chat"): PPTestSettings.fast(trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-13B-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "bigscience/bloomz-1b1"): PPTestSettings.fast(), os.path.join(models_path_prefix, "bigscience/bloomz-1b1"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "THUDM/chatglm3-6b"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "THUDM/chatglm3-6b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "CohereForAI/c4ai-command-r-v01"): PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "CohereForAI/c4ai-command-r-v01"): PPTestSettings.fast(load_format="dummy"),
os.path.join(models_path_prefix, "databricks/dbrx-instruct"): PPTestSettings.fast(tp_base=8), os.path.join(models_path_prefix, "databricks/dbrx-instruct"): PPTestSettings.fast(load_format="dummy"),
os.path.join(models_path_prefix, "Deci/DeciLM-7B-instruct"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "Deci/DeciLM-7B-instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "deepseek-ai/deepseek-llm-7b-chat"): PPTestSettings.fast(), os.path.join(models_path_prefix, "deepseek-ai/deepseek-llm-7b-chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V2-Lite-Chat"): PPTestSettings.fast(trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V2-Lite-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"): PPTestSettings.fast(), os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "tiiuae/falcon-7b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "tiiuae/falcon-7b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "google/gemma-2b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "google/gemma-2b"): PPTestSettings.fast(),
...@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = { ...@@ -155,36 +159,36 @@ TEXT_GENERATION_MODELS = {
os.path.join(models_path_prefix, "ibm/PowerMoE-3b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "ibm/PowerMoE-3b"): PPTestSettings.fast(),
# Uses Llama # Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(), # "internlm/internlm-chat-7b": PPTestSettings.fast(),
os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "inceptionai/jais-13b-chat"): PPTestSettings.fast(), os.path.join(models_path_prefix, "inceptionai/jais-13b-chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"): PPTestSettings.fast(), os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"): PPTestSettings.detailed(), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"): PPTestSettings.detailed(),
os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"): PPTestSettings.fast(),
# Uses Llama # Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"): PPTestSettings.fast(tp_base=4), os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"): PPTestSettings.fast(load_format="dummy"), # noqa: E501
os.path.join(models_path_prefix, "mosaicml/mpt-7b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "mosaicml/mpt-7b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base"): PPTestSettings.fast(), os.path.join(models_path_prefix, "nvidia/Minitron-8B-Base"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "allenai/OLMo-1B-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "allenai/OLMo-1B-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "shanearora/OLMo-7B-1124-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "shanearora/OLMo-7B-1124-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"): PPTestSettings.fast(), os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "adept/persimmon-8b-chat"): PPTestSettings.fast(), os.path.join(models_path_prefix, "adept/persimmon-8b-chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "microsoft/phi-2"): PPTestSettings.fast(), os.path.join(models_path_prefix, "microsoft/phi-2"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"): PPTestSettings.fast(trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"): PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"): PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"): PPTestSettings.fast(), os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat"): PPTestSettings.fast(), os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"): PPTestSettings.fast(), os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct"): PPTestSettings.fast(tp_base=2), os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct"): PPTestSettings.fast(load_format="dummy"), # noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version. # FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True), # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only] # [Encoder-only]
# TODO: Implement PP # TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(), # "facebook/bart-base": PPTestSettings.fast(),
...@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = { ...@@ -192,9 +196,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated] EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only] # [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(), os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"): PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(), os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"): PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"): PPTestSettings.fast(load_format="dummy"),
} }
MULTIMODAL_MODELS = { MULTIMODAL_MODELS = {
...@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = { ...@@ -202,20 +206,20 @@ MULTIMODAL_MODELS = {
os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "facebook/chameleon-7b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "facebook/chameleon-7b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "adept/fuyu-8b"): PPTestSettings.fast(), os.path.join(models_path_prefix, "adept/fuyu-8b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "THUDM/glm-4v-9b"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "THUDM/glm-4v-9b"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "llava-hf/LLaVA-NeXT-Video-7B-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"): PPTestSettings.fast(), os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"): PPTestSettings.fast(trust_remote_code=True), # noqa: E501 os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"): PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501 os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"): PPTestSettings.fast(load_format="dummy"),
os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"): PPTestSettings.fast(), os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"): PPTestSettings.fast(), os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3"): PPTestSettings.fast(trust_remote_code=True), os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"): PPTestSettings.fast(),
# [Encoder-decoder] # [Encoder-decoder]
# TODO: Implement PP # TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
...@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = { ...@@ -226,7 +230,7 @@ MULTIMODAL_MODELS = {
TEST_MODELS = [ TEST_MODELS = [
# [LANGUAGE GENERATION] # [LANGUAGE GENERATION]
os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"), os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "ibm/PowerLM-3b"), os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
# [LANGUAGE EMBEDDING] # [LANGUAGE EMBEDDING]
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"), os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
...@@ -234,21 +238,23 @@ TEST_MODELS = [ ...@@ -234,21 +238,23 @@ TEST_MODELS = [
# [MULTIMODAL GENERATION] # [MULTIMODAL GENERATION]
os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"), os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"), os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3"), os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
# [LANGUAGE GENERATION - HYBRID ARCH] # [LANGUAGE GENERATION - HYBRID ARCH]
os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"),
] ]
def _compare_tp( def _compare_tp(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str,
task: TaskOption, task: TaskOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available: int, num_gpus_available: int,
*, *,
method: Literal["generate", "encode"], method: Literal["generate", "encode"],
is_multimodal: bool,
): ):
( (
tp_size, tp_size,
...@@ -256,13 +262,32 @@ def _compare_tp( ...@@ -256,13 +262,32 @@ def _compare_tp(
eager_mode, eager_mode,
chunked_prefill, chunked_prefill,
) = parallel_setup ) = parallel_setup
(
multi_node_only, multi_node_only, load_format = test_options
trust_remote_code,
tokenizer_mode, model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
load_format, model_info.check_transformers_version(on_fail="skip")
hf_overrides,
) = test_options trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size: if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
...@@ -294,12 +319,15 @@ def _compare_tp( ...@@ -294,12 +319,15 @@ def _compare_tp(
if load_format: if load_format:
common_args.extend(["--load-format", load_format]) common_args.extend(["--load-format", load_format])
if hf_overrides: if hf_overrides:
common_args.extend(["--hf-overrides", hf_overrides]) common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
and chunked_prefill): if distributed_backend == "ray" and (vllm_major_version == "1"
# Test Ray ADAG for a subset of the tests or specific_case):
# For V1, test Ray ADAG for all the tests
# For V0, test Ray ADAG for a subset of the tests
pp_env = { pp_env = {
"VLLM_USE_V1": vllm_major_version,
"VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
...@@ -334,11 +362,7 @@ def _compare_tp( ...@@ -334,11 +362,7 @@ def _compare_tp(
] ]
try: try:
compare_two_settings(model_name, compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
pp_args,
tp_args,
pp_env,
method=method)
except Exception: except Exception:
if pp_env is None: if pp_env is None:
raise raise
...@@ -348,81 +372,87 @@ def _compare_tp( ...@@ -348,81 +372,87 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"test_options"), "task", "test_options"),
[ [
params for model_name, settings in TEXT_GENERATION_MODELS.items() params for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_language_generation( def test_tp_language_generation(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str,
task: TaskOption, task: TaskOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate",
is_multimodal=False)
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"test_options"), "task", "test_options"),
[ [
params for model_name, settings in EMBEDDING_MODELS.items() params for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_language_embedding( def test_tp_language_embedding(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str,
task: TaskOption, task: TaskOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="encode") method="encode",
is_multimodal=False)
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"test_options"), "task", "test_options"),
[ [
params for model_name, settings in MULTIMODAL_MODELS.items() params for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_multimodal_generation( def test_tp_multimodal_generation(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str,
task: TaskOption, task: TaskOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate",
is_multimodal=True)
...@@ -2,14 +2,16 @@ ...@@ -2,14 +2,16 @@
import pytest import pytest
from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int): def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion # This test checks if we are able to run the engine to completion
...@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int): ...@@ -26,6 +28,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.") "decoration.")
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size, block_size=block_size,
enable_prefix_caching=True) enable_prefix_caching=True)
......
...@@ -2,13 +2,15 @@ ...@@ -2,13 +2,15 @@
import pytest import pytest
from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text # without optional detokenization, that detokenization includes text
...@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str): ...@@ -19,7 +21,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available " "paper clips? Is there an easy to follow video tutorial available "
"online for free?") "online for free?")
llm = LLM(model=model) llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(max_tokens=10, sampling_params = SamplingParams(max_tokens=10,
temperature=0.0, temperature=0.0,
detokenize=False) detokenize=False)
......
...@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union ...@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import pytest import pytest
from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
...@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams ...@@ -14,6 +15,10 @@ from vllm.sampling_params import SamplingParams
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
class Mock: class Mock:
... ...
...@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor): ...@@ -34,11 +39,12 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync = CustomUniExecutor CustomUniExecutorAsync = CustomUniExecutor
@pytest.mark.parametrize("model",
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")]) [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor_type_checking(model): def test_custom_executor_type_checking(model):
with pytest.raises(ValueError): with pytest.raises(ValueError):
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock) distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args) LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError): with pytest.raises(ValueError):
...@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model): ...@@ -47,7 +53,8 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args) AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")]) @pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor(model, tmp_path): def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path): ...@@ -56,7 +63,9 @@ def test_custom_executor(model, tmp_path):
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor, distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path): ...@@ -69,7 +78,8 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")]) @pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_custom_executor_async(model, tmp_path): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path): ...@@ -77,7 +87,11 @@ def test_custom_executor_async(model, tmp_path):
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomUniExecutorAsync) model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
)
engine = AsyncLLMEngine.from_engine_args(engine_args) engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path): ...@@ -91,3 +105,20 @@ def test_custom_executor_async(model, tmp_path):
assert os.path.exists(".marker") assert os.path.exists(".marker")
finally: finally:
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# users might do this if they want to manage the
# resources using ray.
engine_args = EngineArgs(
model=model,
distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray
...@@ -2,18 +2,22 @@ ...@@ -2,18 +2,22 @@
import pytest import pytest
from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
def test_skip_tokenizer_initialization(model: str): def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.
llm = LLM(model=model, skip_tokenizer_init=True) llm = LLM(model=model,
skip_tokenizer_init=True,
load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError, match="cannot pass text prompts when"): with pytest.raises(ValueError, match="cannot pass text prompts when"):
......
...@@ -14,7 +14,7 @@ import transformers ...@@ -14,7 +14,7 @@ import transformers
from vllm import SamplingParams from vllm import SamplingParams
from ..utils import models_path_prefix from ..utils import models_path_prefix
MODEL = os.path.join(models_path_prefix, "facebook/opt-350m") MODEL = os.path.join(models_path_prefix, "distilbert/distilgpt2")
STOP_STR = "." STOP_STR = "."
SEED = 42 SEED = 42
MAX_TOKENS = 1024 MAX_TOKENS = 1024
......
...@@ -141,6 +141,47 @@ def sample_definition_json_schema(): ...@@ -141,6 +141,47 @@ def sample_definition_json_schema():
} }
@pytest.fixture
def sample_enum_json_schema():
return {
"type": "object",
"properties": {
"status": {
"type": "string",
"enum": ["active", "inactive",
"pending"] # Literal values using enum
},
"priority": {
"type": "string",
"enum": ["low", "medium", "high", "critical"]
},
"category": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["bug", "feature", "improvement"]
},
"severity": {
"type": "integer",
"enum": [1, 2, 3, 4,
5] # Enum can also contain numbers
}
},
"required": ["type", "severity"]
},
"flags": {
"type": "array",
"items": {
"type": "string",
"enum": ["urgent", "blocked", "needs_review", "approved"]
}
}
},
"required": ["status", "priority", "category", "flags"]
}
@pytest.fixture @pytest.fixture
def sample_guided_choice(): def sample_guided_choice():
return [ return [
......
...@@ -23,10 +23,13 @@ RTOL = 0.03 ...@@ -23,10 +23,13 @@ RTOL = 0.03
EXPECTED_VALUE = 0.58 EXPECTED_VALUE = 0.58
def run_test(): def run_test(more_args=None):
"""Run the end to end accuracy test.""" """Run the end to end accuracy test."""
model_args = f"pretrained={MODEL_NAME},max_model_len=2048" model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
if more_args is not None:
model_args = "{},{}".format(model_args, more_args)
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
...@@ -41,14 +44,21 @@ def run_test(): ...@@ -41,14 +44,21 @@ def run_test():
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(not current_platform.is_cuda(), @pytest.mark.skipif(not current_platform.is_cuda()
reason="V1 is currently only supported on CUDA.") and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
run_test()
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_num_seqs=64"
run_test(more_args)
def test_lm_eval_accuracy_v0_engine(monkeypatch): def test_lm_eval_accuracy_v0_engine(monkeypatch):
......
...@@ -6,13 +6,18 @@ import os ...@@ -6,13 +6,18 @@ import os
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.config import LoadFormat
from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS from ..openai.test_vision import TEST_IMAGE_URLS
from ...utils import models_path_prefix from ...utils import models_path_prefix
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
def test_chat(): def test_chat():
llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")) llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)
prompt1 = "Explain the concept of entropy." prompt1 = "Explain the concept of entropy."
messages = [ messages = [
...@@ -30,7 +35,8 @@ def test_chat(): ...@@ -30,7 +35,8 @@ def test_chat():
def test_multi_chat(): def test_multi_chat():
llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")) llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)
prompt1 = "Explain the concept of entropy." prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is." prompt2 = "Explain what among us is."
...@@ -67,7 +73,8 @@ def test_multi_chat(): ...@@ -67,7 +73,8 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]): def test_chat_multi_image(image_urls: List[str]):
llm = LLM( llm = LLM(
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"), model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
dtype="bfloat16", dtype="bfloat16",
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
......
...@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend): ...@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
def echo_rank(self): def echo_rank(self):
return self.rank return self.rank
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
enforce_eager=True, enforce_eager=True,
load_format="dummy", load_format="dummy",
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
......
...@@ -7,10 +7,11 @@ import pytest ...@@ -7,10 +7,11 @@ import pytest
import os import os
from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") MODEL_NAME = os.path.join(models_path_prefix, "e5-mistral-7b-instruct")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -34,6 +35,7 @@ def llm(): ...@@ -34,6 +35,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=32768, max_num_batched_tokens=32768,
tensor_parallel_size=1, tensor_parallel_size=1,
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
......
...@@ -7,10 +7,11 @@ import os ...@@ -7,10 +7,11 @@ import os
import pytest import pytest
from vllm import LLM, RequestOutput, SamplingParams from vllm import LLM, RequestOutput, SamplingParams
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m") MODEL_NAME = os.path.join(models_path_prefix, "distilgpt2")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -32,6 +33,7 @@ def llm(): ...@@ -32,6 +33,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1, tensor_parallel_size=1,
gpu_memory_utilization=0.10, gpu_memory_utilization=0.10,
......
...@@ -8,11 +8,12 @@ import os ...@@ -8,11 +8,12 @@ import os
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from vllm import LLM from vllm import LLM
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "zephyr-7b-beta")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
...@@ -29,6 +30,7 @@ def llm(): ...@@ -29,6 +30,7 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
tensor_parallel_size=1, tensor_parallel_size=1,
max_model_len=8192, max_model_len=8192,
enable_lora=True, enable_lora=True,
......
...@@ -8,6 +8,7 @@ import jsonschema ...@@ -8,6 +8,7 @@ import jsonschema
import pytest import pytest
import os import os
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
...@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput ...@@ -15,7 +16,7 @@ from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-7B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct")
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
...@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] ...@@ -23,7 +24,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def llm(): def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM(model=MODEL_NAME, max_model_len=1024) llm = LLM(model=MODEL_NAME,
load_format=LoadFormat.RUNAI_STREAMER,
max_model_len=1024)
with llm.deprecate_legacy_api(): with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
...@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm, ...@@ -149,6 +152,47 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm,
schema=sample_definition_json_schema) schema=sample_definition_json_schema)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_enum_json_completion(sample_enum_json_schema, llm,
                                     guided_decoding_backend: str):
    """Check that guided JSON decoding honours ``enum`` constraints.

    Two identical prompts are generated with the enum schema as the guided
    decoding target. Each completion must parse as JSON, validate against the
    schema, and contain only the enumerated values for every field.
    """
    guided_params = GuidedDecodingParams(json=sample_enum_json_schema,
                                         backend=guided_decoding_backend)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=guided_params)
    request_text = (
        "Create a bug report JSON that fits this schema: "
        f"{sample_enum_json_schema}. Make it for a high priority critical bug."
    )

    outputs = llm.generate(prompts=[request_text] * 2,
                           sampling_params=sampling_params,
                           use_tqdm=True)
    assert outputs is not None

    # Allowed values, mirroring the enums declared in the schema fixture.
    allowed_status = ["active", "inactive", "pending"]
    allowed_priority = ["low", "medium", "high", "critical"]
    allowed_category_type = ["bug", "feature", "improvement"]
    allowed_severity = [1, 2, 3, 4, 5]
    allowed_flags = ["urgent", "blocked", "needs_review", "approved"]

    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)

        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json,
                            schema=sample_enum_json_schema)

        # Additional assertions to verify enum values
        category = output_json["category"]
        assert output_json["status"] in allowed_status
        assert output_json["priority"] in allowed_priority
        assert category["type"] in allowed_category_type
        assert category["severity"] in allowed_severity
        for flag in output_json["flags"]:
            assert flag in allowed_flags
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_choice_completion(sample_guided_choice, llm, def test_guided_choice_completion(sample_guided_choice, llm,
......
...@@ -7,11 +7,12 @@ from contextlib import nullcontext ...@@ -7,11 +7,12 @@ from contextlib import nullcontext
from vllm_test_utils import BlameResult, blame from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
def run_normal(): def run_normal_opt125m():
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
...@@ -35,9 +36,35 @@ def run_normal(): ...@@ -35,9 +36,35 @@ def run_normal():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def run_normal():
    """Generate from a distilgpt2 LLM without guided decoding, then clean up.

    Serves as the baseline run: four fixed prompts are sampled, each
    prompt/completion pair is printed, and the LLM is destroyed afterwards.
    """
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Baseline LLM: no guided decoding backend is configured here.
    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
              load_format=LoadFormat.RUNAI_STREAMER,
              enforce_eager=True,
              gpu_memory_utilization=0.3)

    results = llm.generate(prompts, sampling_params)
    for result in results:
        prompt_text = result.prompt
        completion = result.outputs[0].text
        print(f"Prompt: {prompt_text!r}, Generated text: {completion!r}")

    # Drop the LLM object and release the GPU memory it held.
    del llm
    cleanup_dist_env_and_memory()
def run_lmfe(sample_regex): def run_lmfe(sample_regex):
# Create an LLM with guided decoding enabled. # Create an LLM with guided decoding enabled.
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"), llm = LLM(model=os.path.join(models_path_prefix, "distilgpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True, enforce_eager=True,
guided_decoding_backend="lm-format-enforcer", guided_decoding_backend="lm-format-enforcer",
gpu_memory_utilization=0.3) gpu_memory_utilization=0.3)
......
...@@ -5,6 +5,7 @@ import os ...@@ -5,6 +5,7 @@ import os
from vllm import LLM from vllm import LLM
from ...utils import models_path_prefix from ...utils import models_path_prefix
from vllm.config import LoadFormat
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -16,13 +17,17 @@ def v1(run_with_both_engines): ...@@ -16,13 +17,17 @@ def v1(run_with_both_engines):
def test_empty_prompt(): def test_empty_prompt():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True) llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'): with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""]) llm.generate([""])
@pytest.mark.skip_v1 @pytest.mark.skip_v1
def test_out_of_vocab_token(): def test_out_of_vocab_token():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True) llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'): with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]}) llm.generate({"prompt_token_ids": [999999]})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment