Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
from typing import List
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
get_sequence_groups, schedule_and_update_computed_tokens)
def test_scheduler_schedule_simple_encoder_decoder():
    '''
    Test basic scheduler functionality in the context
    of an encoder/decoder model. Focus on testing
    enc/dec-specific functionality since tests already
    exist for decoder-only functionality

    Test behavior:
    * Construct Scheduler
    * Construct dummy encoder/decoder sequence groups
    * Add dummy seq groups to scheduler backlog
    * Schedule the next seq group & validate:
        * Cross-attn block tables
        * Updated states of seq groups
        * Number of batched tokens
        * Number of blocks to copy/swap-in/swap-out
        * Number of scheduled seq groups
    * Repeat for both prefill- and decode-phase
    * Abort scheduled seq groups
    * Assert that aborted seq groups no longer appear in
      cross-attention block table
    '''
    block_size = 4
    num_seq_group = 4
    max_model_len = 16

    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # enc and dec prompts per seq_group
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Queue one dummy encoder/decoder seq group per request id.
    req_id_list = [str(idx) for idx in range(num_seq_group)]
    running: List[SequenceGroup] = []
    for request_id in req_id_list:
        _, _, group = create_dummy_prompt_encoder_decoder(
            request_id, block_size, block_size, block_size)
        scheduler.add_seq_group(group)
        running.append(group)

    # ---- Prefill phase ----
    expected_prefill_tokens = block_size * num_seq_group
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)

    # Every request must own a cross-attention block table in the
    # block manager once it has been scheduled.
    registered = [
        request_id in scheduler.block_manager.cross_block_tables
        for request_id in req_id_list
    ]
    assert all(registered)
    # Exactly the queued seq groups were scheduled.
    assert set(get_sequence_groups(out)) == set(running)
    # One full block of prompt tokens per seq group.
    assert out.num_batched_tokens == expected_prefill_tokens
    # Nothing to copy or swap.
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    # All seq groups produced metadata.
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # ---- Decode phase ----
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)

    # Metadata must carry both the encoder sequence data and the
    # cross-attention block table.
    has_enc_dec_metadata = [
        (meta.encoder_seq_data is not None
         and meta.cross_block_table is not None)
        for meta in seq_group_meta_list
    ]
    assert all(has_enc_dec_metadata)
    # Same seq groups are still scheduled.
    assert set(get_sequence_groups(out)) == set(running)
    # Decode batches a single token per seq group.
    assert out.num_batched_tokens == num_seq_group
    # Nothing to copy or swap.
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    # All seq groups produced metadata.
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # ---- Abort ----
    for request_id in req_id_list:
        scheduler.abort_seq_group(request_id)
        # The aborted request's cross-attention block table must be
        # NO LONGER registered with the block manager.
        assert request_id not in scheduler.block_manager.cross_block_tables
import msgspec
from vllm.executor.msgspec_utils import decode_hook, encode_hook
from vllm.sequence import ExecuteModelRequest
from ..spec_decode.utils import create_batch
def test_msgspec_serialization():
    """Round-trip an ExecuteModelRequest through msgpack and compare it.

    The request is encoded with ``encode_hook`` and decoded with
    ``decode_hook``; the first sequence-group metadata entry of the decoded
    request must match the original on block tables, prompt flag, request
    id, and the prompt/output token ids of sequence 0.
    """
    num_lookahead_slots = 4
    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
    execute_model_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=num_lookahead_slots,
        running_queue_size=4)

    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                      dec_hook=decode_hook)
    payload = encoder.encode(execute_model_req)
    req = decoder.decode(payload)

    sent_list = execute_model_req.seq_group_metadata_list
    received_list = req.seq_group_metadata_list
    assert (len(sent_list) == len(received_list))

    # Only the first entry is compared field by field.
    sent = sent_list[0]
    received = received_list[0]
    assert sent.block_tables == received.block_tables
    assert sent.is_prompt == received.is_prompt
    assert sent.request_id == received.request_id

    sent_seq = sent.seq_data[0]
    received_seq = received.seq_data[0]
    assert (sent_seq.prompt_token_ids == received_seq.prompt_token_ids)
    assert (sent_seq.output_token_ids == received_seq.output_token_ids)
...@@ -13,15 +13,18 @@ def create_dummy_prompt( ...@@ -13,15 +13,18 @@ def create_dummy_prompt(
prompt_length: int, prompt_length: int,
block_size: Optional[int] = None, block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1, best_of: int = 1,
prompt_tokens: Optional[List[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
) -> Tuple[Sequence, SequenceGroup]: ) -> Tuple[Sequence, SequenceGroup]:
if not block_size: if not block_size:
block_size = prompt_length block_size = prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1 if prompt_tokens is None:
# and prompt "0 ... block_size". # Create dummy prompt sequence with tokens 0...block_size-1
prompt_tokens = list(range(prompt_length)) # and prompt "0 ... block_size".
prompt_tokens = list(range(prompt_length))
prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id), prompt = Sequence(int(request_id),
inputs={ inputs={
...@@ -33,8 +36,9 @@ def create_dummy_prompt( ...@@ -33,8 +36,9 @@ def create_dummy_prompt(
seqs=[prompt], seqs=[prompt],
arrival_time=time.time(), arrival_time=time.time(),
sampling_params=SamplingParams( sampling_params=SamplingParams(
use_beam_search=use_beam_search, best_of=best_of,
best_of=best_of), max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request) lora_request=lora_request)
return prompt, seq_group return prompt, seq_group
...@@ -46,39 +50,39 @@ def create_dummy_prompt_encoder_decoder( ...@@ -46,39 +50,39 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length: int, encoder_prompt_length: int,
block_size: Optional[int] = None, block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1, best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]: ) -> Tuple[Sequence, Sequence, SequenceGroup]:
if not block_size: if not block_size:
block_size = decoder_prompt_length block_size = decoder_prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1 # Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size". # and prompt "0 ... block_size". Note that the prompt string
# doesn't actually match the tokens
decoder_prompt_tokens = list(range(decoder_prompt_length)) decoder_prompt_tokens = list(range(decoder_prompt_length))
decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens])
encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
inputs = {
"prompt": decoder_prompt_str,
"prompt_token_ids": decoder_prompt_tokens,
"encoder_prompt": encoder_prompt_str,
"encoder_prompt_token_ids": encoder_prompt_tokens,
"multi_modal_data": None,
}
decoder_prompt = Sequence(int(request_id), decoder_prompt = Sequence(int(request_id),
inputs={ inputs=inputs,
"prompt": decoder_prompt_str, block_size=block_size,
"prompt_token_ids": decoder_prompt_tokens, from_decoder_prompt=True)
"multi_modal_data": None,
},
block_size=block_size)
encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
encoder_prompt = Sequence(int(request_id), encoder_prompt = Sequence(int(request_id),
inputs={ inputs=inputs,
"prompt": encoder_prompt_str, block_size=block_size,
"prompt_token_ids": encoder_prompt_tokens, from_decoder_prompt=False)
"multi_modal_data": None,
},
block_size=block_size)
seq_group = SequenceGroup(request_id=request_id, seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt], seqs=[decoder_prompt],
sampling_params=SamplingParams( sampling_params=SamplingParams(best_of=best_of),
use_beam_search=use_beam_search,
best_of=best_of),
arrival_time=time.time(), arrival_time=time.time(),
lora_request=lora_request, lora_request=lora_request,
encoder_seq=encoder_prompt) encoder_seq=encoder_prompt)
...@@ -139,17 +143,21 @@ def create_seq_group_encoder_decoder( ...@@ -139,17 +143,21 @@ def create_seq_group_encoder_decoder(
prompt_token_ids = [0] * seq_prompt_len prompt_token_ids = [0] * seq_prompt_len
inputs = {
"prompt": "",
"prompt_token_ids": prompt_token_ids,
"encoder_prompt": "",
"encoder_prompt_token_ids": prompt_token_ids,
"multi_modal_data": None,
}
seqs = [] seqs = []
for seq_id_offset, output_len in enumerate(seq_output_lens): for seq_id_offset, output_len in enumerate(seq_output_lens):
seq = Sequence( # Construct decoder input sequences
seq_id=seq_id_start + seq_id_offset, seq = Sequence(seq_id=seq_id_start + seq_id_offset,
inputs={ inputs=inputs,
"prompt": "", block_size=16,
"prompt_token_ids": prompt_token_ids, from_decoder_prompt=True)
"multi_modal_data": None,
},
block_size=16,
)
for i in range(output_len): for i in range(output_len):
seq.append_token_id( seq.append_token_id(
...@@ -158,16 +166,11 @@ def create_seq_group_encoder_decoder( ...@@ -158,16 +166,11 @@ def create_seq_group_encoder_decoder(
) )
seqs.append(seq) seqs.append(seq)
# Encoder sequence # Encoder input sequence
encoder_seq = Sequence( encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens),
seq_id=seq_id_start + len(seq_output_lens), inputs=inputs,
inputs={ block_size=16,
"prompt": "", from_decoder_prompt=False)
"prompt_token_ids": prompt_token_ids,
"multi_modal_data": None,
},
block_size=16,
)
return SequenceGroup(request_id=request_id, return SequenceGroup(request_id=request_id,
seqs=seqs, seqs=seqs,
...@@ -177,4 +180,31 @@ def create_seq_group_encoder_decoder( ...@@ -177,4 +180,31 @@ def create_seq_group_encoder_decoder(
def round_up_to_next_block(seq_len: int, block_size: int) -> int: def round_up_to_next_block(seq_len: int, block_size: int) -> int:
return (seq_len + block_size - 1) // block_size return (seq_len + block_size - 1) // block_size
\ No newline at end of file
# Helper functions for scheduler tests
def get_sequence_groups(scheduler_output):
    """Return the ``seq_group`` of every scheduled entry, in order."""
    groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        groups.append(scheduled.seq_group)
    return groups
def append_new_token(out, token_id: int):
    """Append ``token_id`` to every sequence of every scheduled seq group.

    Each appended token is paired with a logprob mapping
    ``{token_id: Logprob(token_id)}``.
    """
    for group in get_sequence_groups(out):
        for sequence in group.get_seqs():
            logprobs = {token_id: Logprob(token_id)}
            sequence.append_token_id(token_id, logprobs)
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and record the tokens it scheduled.

    Calls ``scheduler.schedule()``, then advances each scheduled seq
    group's computed-token count by the ``token_chunk_size`` of its
    paired metadata entry.

    Returns:
        The ``(metas, out)`` pair from ``scheduler.schedule()``; the third
        element of its result is discarded.
    """
    metas, out, _ignored = scheduler.schedule()
    pairs = zip(out.scheduled_seq_groups, metas)
    for scheduled, meta in pairs:
        chunk = meta.token_chunk_size
        scheduled.seq_group.update_num_computed_tokens(chunk)
    return metas, out
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
    """Mark ``token_chunk_size`` tokens computed, then append ``token_id``.

    The computed-token count of ``seq_group`` is updated first; afterwards
    every sequence in the group receives ``token_id`` together with the
    logprob mapping ``{token_id: Logprob(token_id)}``.
    """
    seq_group.update_num_computed_tokens(token_chunk_size)
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness.py
```
"""
import os
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
# Test suite selected through the environment; defaults to "L4".
TARGET_TEST_SUITE = os.getenv("TARGET_TEST_SUITE", "L4")
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Check that vLLM with tensor parallelism 2 matches HF greedy output.

    Cases whose ``test_suite`` differs from ``TARGET_TEST_SUITE`` are
    skipped. One L4 case additionally turns on the Ray SPMD worker and
    compiled DAG via environment variables, and a non-empty
    ``attention_backend`` is exported as ``VLLM_ATTENTION_BACKEND``.
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    # test ray adag: only this exact parameter combination enables it.
    selected_case = (model, distributed_executor_backend, attention_backend,
                     test_suite)
    if selected_case == ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"):
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_kwargs = dict(
        dtype=dtype,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest test_chunked_prefill_distributed.py
```
"""
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/opt-125m", "ray"),
    ("meta-llama/Llama-2-7b-hf", "ray"),
    ("facebook/opt-125m", "mp"),
    ("meta-llama/Llama-2-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Check vLLM (TP=2, chunked prefill) against HF greedy output.

    vLLM runs with chunked prefill enabled and a batched-token budget of
    16; its greedy generations must equal those of the HF runner.
    """
    dtype = "half"
    max_tokens = 5
    chunked_prefill_token_size = 16

    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_kwargs = dict(
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=max_num_seqs,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
    )
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
...@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank % tp_size] t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_reduce(t) t = tensor_model_parallel_all_reduce(t)
assert torch.allclose(t, expected) torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
...@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
expected = torch.cat(all_tensors, dim=all_gather_dimension) expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[rank % tp_size] t = all_tensors[rank % tp_size]
t = tensor_model_parallel_all_gather(t, all_gather_dimension) t = tensor_model_parallel_all_gather(t, all_gather_dimension)
assert torch.allclose(t, expected) torch.testing.assert_close(t, expected)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
...@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
else: else:
recv_dict = broadcast_tensor_dict(src=0) recv_dict = broadcast_tensor_dict(src=0)
assert len(recv_dict) == len(test_dict) assert len(recv_dict) == len(test_dict)
assert torch.allclose(recv_dict["a"], test_dict["a"]) torch.testing.assert_close(recv_dict["a"], test_dict["a"])
assert torch.allclose(recv_dict["b"], test_dict["b"]) torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"] assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"] assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"] assert recv_dict["e"] == test_dict["e"]
assert torch.allclose(recv_dict["f"], test_dict["f"]) torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
...@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
if not get_pp_group().is_first_rank: if not get_pp_group().is_first_rank:
assert len(recv_dict) == len(test_dict) assert len(recv_dict) == len(test_dict)
assert torch.allclose(recv_dict["a"], test_dict["a"]) torch.testing.assert_close(recv_dict["a"], test_dict["a"])
assert torch.allclose(recv_dict["b"], test_dict["b"]) torch.testing.assert_close(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"] assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"] assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"] assert recv_dict["e"] == test_dict["e"]
assert torch.allclose(recv_dict["f"], test_dict["f"]) torch.testing.assert_close(recv_dict["f"], test_dict["f"])
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
...@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, ...@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
get_pp_group().send(test_tensor) get_pp_group().send(test_tensor)
if not get_pp_group().is_first_rank: if not get_pp_group().is_first_rank:
assert torch.allclose(test_tensor, recv_tensor) torch.testing.assert_close(test_tensor, recv_tensor)
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
......
...@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
out2 = tensor_model_parallel_all_reduce(inp2) out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group) dist.all_reduce(inp2, group=group)
graph.replay() graph.replay()
assert torch.allclose(out1, inp1) torch.testing.assert_close(out1, inp1)
assert torch.allclose(out2, inp2) torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
...@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
out = inp out = inp
for _ in range(num_communication): for _ in range(num_communication):
out = fa.all_reduce_unreg(out) out = fa.all_reduce_unreg(out)
assert torch.allclose(out, inp * (tp_size**num_communication)) torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp out = inp
for _ in range(num_communication): for _ in range(num_communication):
out = fa.all_reduce_unreg(out) out = fa.all_reduce_unreg(out)
assert torch.allclose(out, inp * (tp_size**num_communication)) torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
......
from ..entrypoints.openai.test_oot_registration import (
run_and_test_dummy_opt_api_server)
def test_distributed_oot(dummy_opt_path: str):
    """Run the dummy-OPT API server check with ``tp=2``."""
    tensor_parallel = 2
    run_and_test_dummy_opt_api_server(dummy_opt_path, tp=tensor_parallel)
"""Make sure ray assigns GPU workers to the correct node.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_multi_node_assignment.py
```
"""
import os
import pytest
import ray
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import initialize_ray_cluster
from vllm.config import ParallelConfig
from vllm.executor.ray_utils import _wait_until_pg_removed
from vllm.utils import get_ip
# True only when the VLLM_MULTI_NODE environment variable is exactly "1".
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.skipif(not VLLM_MULTI_NODE,
                    reason="Need at least 2 nodes to run the test.")
def test_multi_node_assignment() -> None:
    """Check that GPU workers report the same IP as the current process.

    Repeats ten times: initialize the Ray cluster for a ParallelConfig(1, 2),
    start one actor per GPU bundle of the placement group, assert each
    actor's IP equals the local IP, then kill the actors and wait for the
    placement group to be removed.
    """

    # NOTE: important to keep this class definition here
    # to let ray use cloudpickle to serialize it.
    class Actor:

        def get_ip(self):
            return get_ip()

    for _ in range(10):
        config = ParallelConfig(1, 2)
        initialize_ray_cluster(config)

        current_ip = get_ip()
        workers = []
        placement_group = config.placement_group
        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
            # Only bundles that reserve a GPU host a worker.
            if bundle.get("GPU", 0):
                scheduling_strategy = PlacementGroupSchedulingStrategy(
                    placement_group=config.placement_group,
                    placement_group_capture_child_tasks=True,
                    placement_group_bundle_index=bundle_id,
                )
                remote_actor_cls = ray.remote(
                    num_cpus=0,
                    num_gpus=1,
                    scheduling_strategy=scheduling_strategy,
                )(Actor)
                worker = remote_actor_cls.remote()
                worker_ip = ray.get(worker.get_ip.remote())
                assert worker_ip == current_ip
                workers.append(worker)

        for worker in workers:
            ray.kill(worker)

        _wait_until_pg_removed(config.placement_group)
...@@ -6,47 +6,267 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node ...@@ -6,47 +6,267 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail. to fail.
""" """
import os import os
from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional
import pytest import pytest
from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " class ParallelSetup(NamedTuple):
"MODEL_NAME, DIST_BACKEND"), tp_size: int
[ pp_size: int
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"), eager_mode: bool
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), chunked_prefill: bool
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), @dataclass
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), class PPTestSettings:
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), parallel_setups: List[ParallelSetup]
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), distributed_backends: List[str]
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), trust_remote_code: bool
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), tokenizer_mode: Optional[str]
])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, @staticmethod
DIST_BACKEND): def detailed(
if VLLM_MULTI_NODE and DIST_BACKEND == "mp": *,
tp_base: int = 1,
pp_base: int = 2,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
@staticmethod
def fast(
    *,
    tp_base: int = 1,
    pp_base: int = 2,
    trust_remote_code: bool = False,
    tokenizer_mode: Optional[str] = None,
):
    """Build settings with a single eager-mode setup on the "mp" backend.

    The only parallel setup uses ``tp_base`` and ``pp_base`` unchanged,
    with eager mode on and chunked prefill off.
    """
    only_setup = ParallelSetup(tp_size=tp_base,
                               pp_size=pp_base,
                               eager_mode=True,
                               chunked_prefill=False)
    return PPTestSettings(
        parallel_setups=[only_setup],
        distributed_backends=["mp"],
        trust_remote_code=trust_remote_code,
        tokenizer_mode=tokenizer_mode,
    )
def iter_params(self, model_name: str):
    """Yield one parameter tuple per (parallel setup, backend) pair.

    Each tuple is ``(model_name, parallel_setup, distributed_backend,
    trust_remote_code, tokenizer_mode)``; backends vary fastest.
    """
    yield from ((model_name, setup, backend, self.trust_remote_code,
                 self.tokenizer_mode)
                for setup in self.parallel_setups
                for backend in self.distributed_backends)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-12b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"core42/jais-13b-chat": PPTestSettings.fast(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
}
MULTIMODAL_MODEL_SETTINGS = {
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
# Each name listed here also appears as a key in one of the
# *_MODEL_SETTINGS dicts above (generation, embedding, or multimodal).
# NOTE(review): the code that consumes this list is not visible in this
# excerpt; presumably it restricts which models are parametrized - confirm.
TEST_MODELS = [
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
"microsoft/Phi-3-vision-128k-instruct",
"fixie-ai/ultravox-v0_3",
]
def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available: int,
    *,
    method: Literal["generate", "encode"] = "encode",
):
    """Compare a pipeline-parallel setup against a tensor-parallel-only one.

    Builds two CLI argument lists that share the same base options and hands
    them to ``compare_two_settings``: one with pipeline parallelism on the
    requested backend, one with tensor parallelism only on the "mp" backend.

    Args:
        model_name: Model forwarded to ``compare_two_settings``.
        parallel_setup: Unpacked as
            ``(tp_size, pp_size, eager_mode, chunked_prefill)``.
        distributed_backend: Executor backend for the pipeline-parallel run.
        trust_remote_code: Adds ``--trust-remote-code`` when true.
        tokenizer_mode: Adds ``--tokenizer-mode <value>`` when truthy.
        num_gpus_available: The test is skipped when this is smaller than
            ``tp_size * pp_size``.
        method: Forwarded to ``compare_two_settings``.

    Failures are re-raised unless the Ray ADAG environment is active, in
    which case they are only logged.
    """
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])

    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    # NOTE: common_args is unpacked only here, after the ADAG branch above
    # has had the chance to append to it, so both runs see the same flags.
    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE(review): the diff view this was recovered from collapsed a few
    # unchanged lines at this point (most likely the start of this comment)
    # -- confirm against the repository.
    # schedule all workers in a node other than the head node,
    # which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_name,
                             pp_args,
                             tp_args,
                             pp_env,
                             method=method)
    except Exception:
        if pp_env is None:
            raise
        else:
            # Ray ADAG tests are flaky, so we don't want to fail the test
            logger.exception("Ray ADAG tests failed")
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Run the parallel-setup comparison for language generation models.

    One case is generated per parameter set yielded by the settings of every
    GENERATION_MODEL_SETTINGS entry that is also listed in TEST_MODELS.
    The comparison itself is delegated to ``_compare_tp`` with the
    "generate" method.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                trust_remote_code,
                tokenizer_mode,
                num_gpus_available,
                method="generate")
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Run the parallel-setup comparison for embedding models.

    One case is generated per parameter set yielded by the settings of every
    EMBEDDING_MODEL_SETTINGS entry that is also listed in TEST_MODELS.
    The comparison itself is delegated to ``_compare_tp`` with the
    "encode" method.
    """
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
                trust_remote_code,
                tokenizer_mode,
                num_gpus_available,
                method="encode")
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        params
        for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Run the parallel-setup comparison for multimodal generation models.

    Cases come from every MULTIMODAL_MODEL_SETTINGS entry that is also
    listed in TEST_MODELS; the work is delegated to ``_compare_tp`` using
    the "generate" method.
    """
    _compare_tp(
        model_name,
        parallel_setup,
        distributed_backend,
        trust_remote_code,
        tokenizer_mode,
        num_gpus_available,
        method="generate",
    )
import os
import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Compare an eager-mode run with a non-eager run under pipeline
    parallelism, once per attention backend.

    The two argument lists differ only by ``--enforce-eager``.
    """
    # Select the attention backend through the environment before comparing.
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    eager_args = [*cudagraph_args, "--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
import os

import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

# Guarded so that merely importing this module does not initialise a
# process group.
if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    # True only when every entry reported for the world group is truthy.
    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))

    # VLLM_TEST_SAME_HOST defaults to "1", i.e. a same-host result is
    # expected unless the environment says otherwise.
    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
    assert test_result == expected, f"Expected {expected}, got {test_result}"
    print("Same node test passed!")
"""E2E tests to verify the correctness of the encoder-decoder framework
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from typing import List, Optional, Tuple
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
def vllm_to_hf_output(
    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Sanitize vllm output to be comparable with hf output.

    The output text always gets a trailing "</s>"; it additionally gets a
    leading "<s>" when the decoder prompt type is ``NONE``. Token ids and
    logprobs are passed through untouched.
    """
    token_ids, text, logprobs = vllm_output
    prefix = "<s>" if decoder_prompt_type == DecoderPromptType.NONE else ""
    return token_ids, prefix + text + "</s>", logprobs
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
    is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    decoder_prompt_type: DecoderPromptType,
    enforce_eager: bool,
) -> None:
    '''
    End-to-End (E2E) test for the encoder-decoder framework.

    Generates greedy outputs with logprobs for the same prompts through the
    Hugging Face runner and the vLLM runner, then checks that the two sets
    of logprobs are close.
    '''
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Generation settings handed to the HF baseline.
    hf_generate_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_generate_kwargs,
        )

    with vllm_runner(model, dtype=dtype,
                     enforce_eager=enforce_eager) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Bring the vLLM results into the same textual shape as the HF results.
    comparable_vllm_outputs = [
        vllm_to_hf_output(vllm_output, decoder_prompt_type)
        for vllm_output in vllm_outputs
    ]

    # One leading HF token is skipped when no decoder prompt was supplied.
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_skip_tokens = 1
    else:
        hf_skip_tokens = 0

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=comparable_vllm_outputs,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )
from argparse import ArgumentTypeError
import pytest
from vllm.engine.arg_utils import EngineArgs, nullable_kvs
from vllm.utils import FlexibleArgumentParser
@pytest.mark.parametrize(("arg", "expected"), [
    (None, None),
    ("image=16", {"image": 16}),
    ("image=16,video=2", {"image": 16, "video": 2}),
    ("Image=16, Video=2", {"image": 16, "video": 2}),
])
def test_limit_mm_per_prompt_parser(arg, expected):
    """Check what ``--limit-mm-per-prompt`` parses to.

    ``arg`` is the raw option value, or None to omit the option entirely;
    ``expected`` is the value the parsed namespace must then hold.
    """
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    argv = [] if arg is None else ["--limit-mm-per-prompt", arg]
    parsed = parser.parse_args(argv)

    assert parsed.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
    "arg",
    [
        "image",  # Missing =
        "image=4,image=5",  # Conflicting values
        "image=video=4",  # Too many = in tokenized arg
    ],
)
def test_bad_nullable_kvs(arg):
    """Each malformed key/value string must be rejected by
    ``nullable_kvs`` with an ``ArgumentTypeError``."""
    with pytest.raises(ArgumentTypeError):
        nullable_kvs(arg)
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
    (None, None, "mm-processor-kwargs"),
    ("{}", {}, "mm-processor-kwargs"),
    (
        '{"num_crops": 4}',
        {"num_crops": 4},
        "mm-processor-kwargs",
    ),
    (
        '{"foo": {"bar": "baz"}}',
        {"foo": {"bar": "baz"}},
        "mm-processor-kwargs",
    ),
    (
        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
        {
            "cast_logits_dtype": "bfloat16",
            "sequence_parallel_norm": True,
            "sequence_parallel_norm_threshold": 2048,
        },
        "override-neuron-config",
    ),
])
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
    """Check that a JSON-valued CLI option is parsed into the expected object.

    ``option`` is the flag name without the leading dashes; ``arg`` is its
    raw value, or None to omit the flag entirely.
    """
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    argv = [] if arg is None else [f"--{option}", arg]
    parsed = parser.parse_args(argv)

    # The value is read from the namespace under the flag name with dashes
    # replaced by underscores.
    attr_name = option.replace("-", "_")
    assert getattr(parsed, attr_name) == expected
...@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model): ...@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir): def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmp_path)
try: try:
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
...@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir): ...@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmp_path)
try: try:
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
......
...@@ -83,7 +83,7 @@ def test_local_workers() -> None: ...@@ -83,7 +83,7 @@ def test_local_workers() -> None:
workers[3].process.kill() workers[3].process.kill()
# Other workers should get shut down here # Other workers should get shut down here
worker_monitor.join(2) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()
...@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None: ...@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
# Clean shutdown # Clean shutdown
worker_monitor.close() worker_monitor.close()
worker_monitor.join(5) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()
...@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None: ...@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
workers[3].process.kill() workers[3].process.kill()
# Other workers should get shut down here # Other workers should get shut down here
worker_monitor.join(2) worker_monitor.join(20)
# Ensure everything is stopped # Ensure everything is stopped
assert not worker_monitor.is_alive() assert not worker_monitor.is_alive()
......
...@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids. # token ids.
llm = LLM(model=model, skip_tokenizer_init=True) llm = LLM(model=model, skip_tokenizer_init=True)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError) as err:
with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params) llm.generate("abc", sampling_params)
assert "prompts must be None if" in str(err.value)
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params) sampling_params=sampling_params)
assert len(outputs) > 0 assert len(outputs) > 0
......
...@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams ...@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams
MODEL = "meta-llama/llama-2-7b-hf" MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200 MAX_TOKENS = 200
IS_ASYNC = False
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def vllm_model(vllm_runner): def vllm_model(vllm_runner):
...@@ -14,99 +16,148 @@ def vllm_model(vllm_runner): ...@@ -14,99 +16,148 @@ def vllm_model(vllm_runner):
yield vllm_model yield vllm_model
def _test_stopping(llm_engine: LLMEngine,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[List[str]] = None,
                   stop_token_ids: Optional[List[int]] = None,
                   include_in_output: bool = False,
                   use_async_output_proc: bool = False) -> None:
    """Run one greedy request to completion and check how it stopped.

    Args:
        llm_engine: Engine the request is added to and stepped on.
        expected_output: Text the final output must equal.
        expected_reason: Value the final ``stop_reason`` must equal.
        stop: Stop strings forwarded to ``SamplingParams``.
        stop_token_ids: Stop token ids forwarded to ``SamplingParams``.
        include_in_output: Forwarded as ``include_stop_str_in_output``.
        use_async_output_proc: When true, one extra ``step()`` is issued
            before the polling loop.
    """
    llm_engine.add_request(
        "id", "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ), None)

    output: Optional[CompletionOutput] = None
    output_text = ""
    stop_reason = None

    # NOTE(review): presumably the first step yields no request output when
    # outputs are processed asynchronously, so it is consumed here before
    # the loop starts unpacking results -- confirm against the engine.
    if use_async_output_proc:
        llm_engine.step()

    while llm_engine.has_unfinished_requests():
        # Exactly one request with exactly one completion is in flight.
        (request_output, ) = llm_engine.step()
        (output, ) = request_output.outputs

        # Ensure we don't backtrack
        assert output.text.startswith(output_text)
        output_text = output.text
        stop_reason = output.stop_reason

    assert output is not None
    assert output_text == expected_output
    assert stop_reason == expected_reason
def _set_async_mode(llm_engine, is_async):
llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm_engine, is_async):
    """Check stopping on a single-character stop string, with the stop
    string first excluded from and then included in the output."""
    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".",
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".",
                   use_async_output_proc=is_async)
def _stop_multi_tokens(llm_engine, is_async):
    """Check stopping with several candidate stop strings, one of which is
    a longer phrase, with the matched string first excluded from and then
    included in the output."""
    _test_stopping(
        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo",
        use_async_output_proc=is_async)

    _test_stopping(
        llm_engine,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo",
        use_async_output_proc=is_async)
def _stop_partial_token(llm_engine, is_async):
    """Check stopping on a stop string that begins in the middle of a word,
    with the stop string first excluded from and then included in the
    output."""
    _test_stopping(llm_engine,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani",
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani",
                   use_async_output_proc=is_async)
def _stop_token_id(llm_engine, is_async):
    """Check stopping on a stop token id, with the stop token's text first
    excluded from and then included in the output."""
    # token id 13013 => " organization"

    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013,
                   use_async_output_proc=is_async)

    _test_stopping(llm_engine,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013,
                   use_async_output_proc=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_basic(vllm_model):
    """Run the basic stop-string checks with async output processing
    enabled, then again with it disabled."""
    engine = vllm_model.model.llm_engine

    # The mode is set explicitly before each pass, so the result does not
    # depend on what state an earlier test left the engine in.
    for is_async in (True, False):
        _set_async_mode(engine, is_async)
        _stop_basic(engine, is_async=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(vllm_model):
    """Run the multi-token stop-string checks with async output processing
    enabled, then again with it disabled."""
    engine = vllm_model.model.llm_engine

    # The mode is set explicitly before each pass, so the result does not
    # depend on what state an earlier test left the engine in.
    for is_async in (True, False):
        _set_async_mode(engine, is_async)
        _stop_multi_tokens(engine, is_async=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_partial_token(vllm_model):
    """Run the partial-token stop-string checks with async output
    processing enabled, then again with it disabled."""
    engine = vllm_model.model.llm_engine

    # Async pass first, synchronous pass second.
    for is_async in (True, False):
        _set_async_mode(engine, is_async)
        _stop_partial_token(engine, is_async=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_token_id(vllm_model):
    """Run the stop-token-id checks with async output processing enabled,
    then again with it disabled."""
    engine = vllm_model.model.llm_engine

    # Async pass first, synchronous pass second.
    for is_async in (True, False):
        _set_async_mode(engine, is_async)
        _stop_token_id(engine, is_async=is_async)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment