Commit 539aa992 authored by zhuwenwen
Browse files

Merge tag 'v0.6.2' into v0.6.2-dev

parents 93872128 7193774b
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.plugins import set_torch_compile_backend
from vllm.utils import is_hip
# Each entry below is a ``(model id, extra keyword arguments)`` pair.
# NOTE(review): the pairs are presumably fed to check_full_graph_support()
# as ``model`` / ``model_kwargs`` by the test modules -- confirm at callers.

# Short list: one compressed-tensors checkpoint plus one plain checkpoint.
TEST_MODELS_SMOKE = [
    (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
        {"quantization": "compressed-tensors"},
    ),
    ("meta-llama/Meta-Llama-3-8B", {}),
]

# Full list; further entries are added below depending on which quantization
# methods is_quant_method_supported() reports as available.
TEST_MODELS = [
    ("facebook/opt-125m", {}),
    (
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
        {
            "dtype": torch.float16,
            "quantization": "compressed-tensors",
        },
    ),
    (
        "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
        {
            "dtype": torch.float16,
            "quantization": "fp8",
        },
    ),
    (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
        {"quantization": "compressed-tensors"},
    ),
    ("meta-llama/Meta-Llama-3-8B", {}),
]

# The two entries below are switched off by the literal ``False``; the
# support probe is therefore never evaluated for them.
# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
    TEST_MODELS.append(
        ("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {"quantization": "aqlm"}))

# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"}))

# Methods whose checkpoint is registered whenever the method is supported.
# Probed, and appended, in the order listed.
TEST_MODELS.extend(
    (checkpoint, {"quantization": method})
    for method, checkpoint in (
        ("gptq", "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
        ("gptq_marlin", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"),
        ("gptq_marlin_24", "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
        ("marlin", "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
    )
    if is_quant_method_supported(method)
)

# AWQ is only registered when is_hip() is false; note the upper-case value
# handed on as the quantization name.
if not is_hip() and is_quant_method_supported("awq"):
    TEST_MODELS.append(
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"}))
def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
    """Build ``model`` with the given compile backend and generate greedily.

    The function switches on the ``VLLM_TEST_DYNAMO_*`` environment flags,
    selects ``backend`` through ``set_torch_compile_backend`` and generates
    completions for four fixed prompts, printing every prompt/completion
    pair. Nothing is asserted about the generated text.

    Args:
        model: Model identifier forwarded as ``LLM(model=...)``.
        model_kwargs: Extra keyword arguments for ``LLM``. Its optional
            ``"quantization"`` entry decides whether the run is skipped.
        backend: Backend name handed to ``set_torch_compile_backend``.
        tp_size: Value used for ``tensor_parallel_size``.

    Returns:
        None. Returns early, without building the model, when the
        quantization is fp8, gptq_marlin or gptq_marlin_24 and ``backend``
        is anything other than ``"eager"``.
    """
    # make sure these models can be captured in full graph mode
    # (a graph-capture value already present in the environment is kept).
    os.environ.setdefault("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "1")
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    # Inductor doesn't support fp8/gptq_marlin/gptq_marlin_24 yet, so these
    # are only exercised with the eager backend.
    eager_only = ("fp8", "gptq_marlin", "gptq_marlin_24")
    if backend != "eager" and model_kwargs.get("quantization") in eager_only:
        return

    set_torch_compile_backend(backend)

    llm = LLM(model=model,
              enforce_eager=True,
              tensor_parallel_size=tp_size,
              disable_custom_all_reduce=True,
              **model_kwargs)

    # temperature=0 requests greedy decoding.
    greedy = SamplingParams(temperature=0)
    fixed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Print the outputs.
    for request_output in llm.generate(fixed_prompts, greedy):
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
...@@ -20,6 +20,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding, ...@@ -20,6 +20,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature) BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
...@@ -33,7 +35,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, ...@@ -33,7 +35,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts) to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu) identity, is_cpu)
...@@ -158,10 +159,7 @@ def should_do_global_cleanup_after_test(request) -> bool: ...@@ -158,10 +159,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
to initialize torch. to initialize torch.
""" """
if request.node.get_closest_marker("skip_global_cleanup"): return not request.node.get_closest_marker("skip_global_cleanup")
return False
return True
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -171,6 +169,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): ...@@ -171,6 +169,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup() cleanup()
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Autouse fixture that calls ``torch._dynamo.reset()`` after each test."""
    # Nothing happens before the test; the statement after ``yield`` is the
    # teardown half of the fixture.
    yield
    torch._dynamo.reset()
@pytest.fixture @pytest.fixture
def example_prompts() -> List[str]: def example_prompts() -> List[str]:
prompts = [] prompts = []
...@@ -472,7 +476,7 @@ class HfRunner: ...@@ -472,7 +476,7 @@ class HfRunner:
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None, videos: Optional[List[np.ndarray]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: ) -> List[TokensTextLogprobs]:
all_logprobs: List[List[Dict[int, float]]] = [] all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = [] all_output_ids: List[List[int]] = []
all_output_strs: List[str] = [] all_output_strs: List[str] = []
...@@ -528,7 +532,7 @@ class HfRunner: ...@@ -528,7 +532,7 @@ class HfRunner:
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: ) -> List[TokensTextLogprobs]:
''' '''
Greedy logprobs generation for vLLM encoder/decoder models Greedy logprobs generation for vLLM encoder/decoder models
''' '''
...@@ -656,14 +660,16 @@ class VllmRunner: ...@@ -656,14 +660,16 @@ class VllmRunner:
@staticmethod @staticmethod
def _final_steps_generate_w_logprobs( def _final_steps_generate_w_logprobs(
req_outputs: List[RequestOutput], req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[TokensTextLogprobsPromptLogprobs]:
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = [] outputs: List[TokensTextLogprobsPromptLogprobs] = []
for req_output in req_outputs: for req_output in req_outputs:
assert len(req_output.outputs) > 0
for sample in req_output.outputs: for sample in req_output.outputs:
output_str = sample.text output_str = sample.text
output_ids = list(sample.token_ids) output_ids = list(sample.token_ids)
output_logprobs = sample.logprobs output_logprobs = sample.logprobs
outputs.append((output_ids, output_str, output_logprobs)) outputs.append((output_ids, output_str, output_logprobs,
req_output.prompt_logprobs))
return outputs return outputs
def generate_w_logprobs( def generate_w_logprobs(
...@@ -673,9 +679,8 @@ class VllmRunner: ...@@ -673,9 +679,8 @@ class VllmRunner:
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> Union[List[TokensTextLogprobs],
assert sampling_params.logprobs is not None List[TokensTextLogprobsPromptLogprobs]]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
...@@ -698,13 +703,20 @@ class VllmRunner: ...@@ -698,13 +703,20 @@ class VllmRunner:
req_outputs = self.model.generate(inputs, req_outputs = self.model.generate(inputs,
sampling_params=sampling_params) sampling_params=sampling_params)
return self._final_steps_generate_w_logprobs(req_outputs)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)
def generate_encoder_decoder_w_logprobs( def generate_encoder_decoder_w_logprobs(
self, self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams, sampling_params: SamplingParams,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
''' '''
Logprobs generation for vLLM encoder/decoder models Logprobs generation for vLLM encoder/decoder models
''' '''
...@@ -712,7 +724,12 @@ class VllmRunner: ...@@ -712,7 +724,12 @@ class VllmRunner:
assert sampling_params.logprobs is not None assert sampling_params.logprobs is not None
req_outputs = self.model.generate(encoder_decoder_prompts, req_outputs = self.model.generate(encoder_decoder_prompts,
sampling_params=sampling_params) sampling_params=sampling_params)
return self._final_steps_generate_w_logprobs(req_outputs) toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)
def generate_greedy( def generate_greedy(
self, self,
...@@ -730,44 +747,48 @@ class VllmRunner: ...@@ -730,44 +747,48 @@ class VllmRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None, stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> Union[List[TokensTextLogprobs],
greedy_logprobs_params = SamplingParams(temperature=0.0, List[TokensTextLogprobsPromptLogprobs]]:
max_tokens=max_tokens, greedy_logprobs_params = SamplingParams(
logprobs=num_logprobs, temperature=0.0,
stop_token_ids=stop_token_ids) max_tokens=max_tokens,
outputs = self.generate_w_logprobs(prompts, logprobs=num_logprobs,
greedy_logprobs_params, prompt_logprobs=num_prompt_logprobs,
images=images, stop_token_ids=stop_token_ids)
audios=audios,
videos=videos) return self.generate_w_logprobs(prompts,
greedy_logprobs_params,
return [(output_ids, output_str, output_logprobs) images=images,
for output_ids, output_str, output_logprobs in outputs] audios=audios,
videos=videos)
def generate_encoder_decoder_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs(
self, self,
encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: num_prompt_logprobs: Optional[int] = None,
greedy_logprobs_params = SamplingParams(temperature=0.0, ) -> Union[List[TokensTextLogprobs],
use_beam_search=False, List[TokensTextLogprobsPromptLogprobs]]:
max_tokens=max_tokens, greedy_logprobs_params = SamplingParams(
logprobs=num_logprobs) temperature=0.0,
use_beam_search=False,
max_tokens=max_tokens,
logprobs=num_logprobs,
prompt_logprobs=(num_prompt_logprobs),
)
''' '''
Greedy logprobs generation for vLLM encoder/decoder models Greedy logprobs generation for vLLM encoder/decoder models
''' '''
outputs = self.generate_encoder_decoder_w_logprobs( return self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params) encoder_decoder_prompts, greedy_logprobs_params)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
def generate_beam_search( def generate_beam_search(
self, self,
prompts: List[str], prompts: List[str],
...@@ -781,6 +802,20 @@ class VllmRunner: ...@@ -781,6 +802,20 @@ class VllmRunner:
outputs = self.generate(prompts, beam_search_params) outputs = self.generate(prompts, beam_search_params)
return outputs return outputs
def generate_beam_search_new(
    self,
    prompts: Union[List[str], List[List[int]]],
    beam_width: int,
    max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]:
    """Run ``self.model.beam_search`` and unpack its results.

    Args:
        prompts: Passed through unchanged to ``self.model.beam_search``.
        beam_width: Passed through unchanged.
        max_tokens: Passed through unchanged.

    Returns:
        One ``(token_ids, texts)`` tuple per returned output, where
        ``token_ids`` collects ``.tokens`` and ``texts`` collects ``.text``
        of every entry in that output's ``sequences``, in order.
    """
    results = self.model.beam_search(prompts, beam_width, max_tokens)
    return [
        (
            [seq.tokens for seq in result.sequences],
            [seq.text for seq in result.sequences],
        )
        for result in results
    ]
def encode(self, prompts: List[str]) -> List[List[float]]: def encode(self, prompts: List[str]) -> List[List[float]]:
req_outputs = self.model.encode(prompts) req_outputs = self.model.encode(prompts)
outputs = [] outputs = []
......
...@@ -27,16 +27,19 @@ def schedule_and_update_computed_tokens(scheduler): ...@@ -27,16 +27,19 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out return metas, out
def test_simple(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_simple(use_v2_block_manager: bool):
"""Verify basic scheduling works.""" """Verify basic scheduling works."""
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
num_seq_group, max_num_batched_tokens,
max_model_len, num_seq_group,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -45,7 +48,9 @@ def test_simple(): ...@@ -45,7 +48,9 @@ def test_simple():
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(num_seq_group): for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) _, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
...@@ -69,30 +74,36 @@ def test_simple(): ...@@ -69,30 +74,36 @@ def test_simple():
assert len(seq_group_meta) == num_seq_group assert len(seq_group_meta) == num_seq_group
def test_chunk(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunk(use_v2_block_manager: bool):
"""Verify prefills are chunked properly.""" """Verify prefills are chunked properly."""
block_size = 4 block_size = 4
max_seqs = 60 max_seqs = 60
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 32
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = [] running: List[SequenceGroup] = []
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
# Verify the second request is chunked. # Verify the second request is chunked.
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
print()
assert set(get_sequence_groups(out)) == set(running) assert set(get_sequence_groups(out)) == set(running)
assert seq_group_meta[0].token_chunk_size == 60 assert seq_group_meta[0].token_chunk_size == 60
# Verify it is chunked. # Verify it is chunked.
...@@ -113,24 +124,29 @@ def test_chunk(): ...@@ -113,24 +124,29 @@ def test_chunk():
assert out.num_batched_tokens == 57 assert out.num_batched_tokens == 57
def test_complex(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_complex(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 60 max_seqs = 60
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 64
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 64
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = [] running: List[SequenceGroup] = []
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
assert seq_group.is_prefill() assert seq_group.is_prefill()
...@@ -151,7 +167,9 @@ def test_complex(): ...@@ -151,7 +167,9 @@ def test_complex():
# Add 2 more requests. # Add 2 more requests.
for i in range(2, 4): for i in range(2, 4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
...@@ -176,16 +194,19 @@ def test_complex(): ...@@ -176,16 +194,19 @@ def test_complex():
assert running[2].is_prefill() assert running[2].is_prefill()
def test_maximal_decoding(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_maximal_decoding(use_v2_block_manager: bool):
"""Verify decoding requests are prioritized.""" """Verify decoding requests are prioritized."""
block_size = 4 block_size = 4
max_seqs = 2 max_seqs = 2
max_model_len = 8 max_model_len = 8
max_num_batched_tokens = 2 max_num_batched_tokens = 2
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -194,7 +215,9 @@ def test_maximal_decoding(): ...@@ -194,7 +215,9 @@ def test_maximal_decoding():
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), prompt_length=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
assert seq_group.is_prefill() assert seq_group.is_prefill()
...@@ -211,7 +234,9 @@ def test_maximal_decoding(): ...@@ -211,7 +234,9 @@ def test_maximal_decoding():
append_new_token(running[0], 1) append_new_token(running[0], 1)
# Create one more seq_group. # Create one more seq_group.
_, seq_group = create_dummy_prompt("3", prompt_length=2) _, seq_group = create_dummy_prompt("3",
prompt_length=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
assert seq_group.is_prefill() assert seq_group.is_prefill()
...@@ -263,23 +288,28 @@ def test_maximal_decoding(): ...@@ -263,23 +288,28 @@ def test_maximal_decoding():
assert out.num_batched_tokens == 2 assert out.num_batched_tokens == 2
def test_prompt_limit(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit(use_v2_block_manager: bool):
"""Verify max_num_batched_tokens < max_model_len is possible.""" """Verify max_num_batched_tokens < max_model_len is possible."""
block_size = 4 block_size = 4
max_seqs = 32 max_seqs = 32
max_model_len = 64 max_model_len = 64
max_num_batched_tokens = 32 max_num_batched_tokens = 32
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = [] running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("1", prompt_length=48) _, seq_group = create_dummy_prompt("1",
prompt_length=48,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
assert seq_group.is_prefill() assert seq_group.is_prefill()
...@@ -293,7 +323,8 @@ def test_prompt_limit(): ...@@ -293,7 +323,8 @@ def test_prompt_limit():
assert out.num_batched_tokens == 32 assert out.num_batched_tokens == 32
def test_prompt_limit_exceed(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit_exceed(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 64 max_seqs = 64
max_model_len = 32 max_model_len = 32
...@@ -303,12 +334,13 @@ def test_prompt_limit_exceed(): ...@@ -303,12 +334,13 @@ def test_prompt_limit_exceed():
max_model_len, max_model_len,
enable_chunked_prefill=True) enable_chunked_prefill=True)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = [] running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("2",
_, seq_group = create_dummy_prompt("2", prompt_length=48) prompt_length=48,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
assert seq_group.is_prefill() assert seq_group.is_prefill()
...@@ -317,22 +349,28 @@ def test_prompt_limit_exceed(): ...@@ -317,22 +349,28 @@ def test_prompt_limit_exceed():
assert out.ignored_seq_groups[0] == seq_group assert out.ignored_seq_groups[0] == seq_group
def test_swap(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_swap(use_v2_block_manager: bool):
"""Verify swapping works with chunked prefill requests""" """Verify swapping works with chunked prefill requests"""
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
_, out = schedule_and_update_computed_tokens(scheduler) _, out = schedule_and_update_computed_tokens(scheduler)
# The request is chunked. # The request is chunked.
...@@ -369,21 +407,27 @@ def test_swap(): ...@@ -369,21 +407,27 @@ def test_swap():
assert out.blocks_to_swap_out == [] assert out.blocks_to_swap_out == []
def test_running_prefill_prioritized_over_swap(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 32
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
_, out = schedule_and_update_computed_tokens(scheduler) _, out = schedule_and_update_computed_tokens(scheduler)
# The request is chunked. # The request is chunked.
...@@ -413,7 +457,9 @@ def test_running_prefill_prioritized_over_swap(): ...@@ -413,7 +457,9 @@ def test_running_prefill_prioritized_over_swap():
scheduler.block_manager.can_swap_in = MagicMock() scheduler.block_manager.can_swap_in = MagicMock()
scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
_, seq_group2 = create_dummy_prompt("2", prompt_length=60) _, seq_group2 = create_dummy_prompt("2",
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group2) scheduler.add_seq_group(seq_group2)
_, out = schedule_and_update_computed_tokens(scheduler) _, out = schedule_and_update_computed_tokens(scheduler)
assert len(out.scheduled_seq_groups) == 1 assert len(out.scheduled_seq_groups) == 1
...@@ -455,22 +501,27 @@ def test_running_prefill_prioritized_over_swap(): ...@@ -455,22 +501,27 @@ def test_running_prefill_prioritized_over_swap():
assert out.blocks_to_swap_out == [] assert out.blocks_to_swap_out == []
def test_chunked_prefill_preempt(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_preempt(use_v2_block_manager: bool):
"""Verify preempt works with chunked prefill requests""" """Verify preempt works with chunked prefill requests"""
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
_, seq_group = create_dummy_prompt("1", prompt_length=60) _, seq_group = create_dummy_prompt("1",
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
_, out = schedule_and_update_computed_tokens(scheduler) _, out = schedule_and_update_computed_tokens(scheduler)
# The request is chunked. # The request is chunked.
...@@ -517,22 +568,27 @@ def test_chunked_prefill_preempt(): ...@@ -517,22 +568,27 @@ def test_chunked_prefill_preempt():
assert out.num_batched_tokens == max_num_batched_tokens assert out.num_batched_tokens == max_num_batched_tokens
def test_chunked_prefill_max_seqs(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 2 max_seqs = 2
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 128
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 128
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = [] running: List[SequenceGroup] = []
_, seq_group = create_dummy_prompt("1", prompt_length=65) _, seq_group = create_dummy_prompt("1",
prompt_length=65,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
# The first prefill is chunked. # The first prefill is chunked.
...@@ -542,7 +598,9 @@ def test_chunked_prefill_max_seqs(): ...@@ -542,7 +598,9 @@ def test_chunked_prefill_max_seqs():
# Add new requests. # Add new requests.
for i in range(4): for i in range(4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=65) _, seq_group = create_dummy_prompt(str(i),
prompt_length=65,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
...@@ -564,16 +622,19 @@ def test_chunked_prefill_max_seqs(): ...@@ -564,16 +622,19 @@ def test_chunked_prefill_max_seqs():
assert not running[1].is_prefill() assert not running[1].is_prefill()
def test_perfix_caching(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_perfix_caching(use_v2_block_manager: bool):
"""Verify allocating full blocks when prefix caching is enabled.""" """Verify allocating full blocks when prefix caching is enabled."""
block_size = 4 block_size = 4
max_seqs = 10 max_seqs = 10
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig(
max_seqs, max_num_batched_tokens,
max_model_len, max_seqs,
enable_chunked_prefill=True) max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, cache_config = CacheConfig(block_size,
1.0, 1.0,
1, 1,
......
...@@ -3,7 +3,8 @@ from collections import deque ...@@ -3,7 +3,8 @@ from collections import deque
from typing import List, Set, Tuple from typing import List, Set, Tuple
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest # noqa import pytest
from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus from vllm.core.interfaces import AllocStatus
...@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group, ...@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens) schedule_and_update_computed_tokens)
def test_scheduler_add_seq_group(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1) scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4 cache_config.num_gpu_blocks = 4
...@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group(): ...@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
# Add seq group to scheduler. # Add seq group to scheduler.
num_seq_group = 4 num_seq_group = 4
for i in range(num_seq_group): for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), block_size) _, seq_group = create_dummy_prompt(str(i),
block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
assert scheduler.get_num_unfinished_seq_groups() == i + 1 assert scheduler.get_num_unfinished_seq_groups() == i + 1
def test_scheduler_abort_seq_group(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1) scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4 cache_config.num_gpu_blocks = 4
...@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group(): ...@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
assert scheduler.get_num_unfinished_seq_groups() == 0 assert scheduler.get_num_unfinished_seq_groups() == 0
def test_scheduler_schedule_simple(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_simple(use_v2_block_manager: bool):
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) scheduler_config = SchedulerConfig(
64,
num_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple(): ...@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(num_seq_group): for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) _, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
running.append(seq_group) running.append(seq_group)
...@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple(): ...@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
append_new_token(out, 1) append_new_token(out, 1)
def test_scheduler_prefill_prioritized(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
"""Verify running batched tokens are not applied to prefill requests.""" """Verify running batched tokens are not applied to prefill requests."""
block_size = 4 block_size = 4
max_model_len = 30 max_model_len = 30
max_batched_num_tokens = 30 max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(max_batched_num_tokens, 2, scheduler_config = SchedulerConfig(
max_model_len) max_batched_num_tokens,
2,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 2 cache_config.num_gpu_blocks = 16
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq groups to scheduler. # Add seq groups to scheduler.
_, seq_group_a = create_dummy_prompt("1", 1) _, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
scheduler.add_seq_group(seq_group_a) scheduler.add_seq_group(seq_group_a)
# Schedule seq groups prompts. # Schedule seq groups prompts.
...@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized(): ...@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
assert get_sequence_groups(out) == [seq_group_a] assert get_sequence_groups(out) == [seq_group_a]
# Add a new prefill request B. # Add a new prefill request B.
_, seq_group_b = create_dummy_prompt("2", 30) _, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
scheduler.add_seq_group(seq_group_b) scheduler.add_seq_group(seq_group_b)
# Verify prefill requests are prioritized. Since max_batched_num_tokens # Verify prefill requests are prioritized. Since max_batched_num_tokens
...@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized(): ...@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
assert get_sequence_groups(out) == [seq_group_b] assert get_sequence_groups(out) == [seq_group_b]
def test_scheduler_schedule_preempt_abort(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig(64, 2, max_model_len) scheduler_config = SchedulerConfig(
64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2 cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2 cache_config.num_gpu_blocks = 2
scheduler = Scheduler(scheduler_config, cache_config, None) scheduler = Scheduler(scheduler_config, cache_config, None)
# Add seq groups to scheduler. # Add seq groups to scheduler.
seq_a, seq_group_a = create_dummy_prompt("1", block_size) seq_a, seq_group_a = create_dummy_prompt("1",
seq_b, seq_group_b = create_dummy_prompt("2", block_size) block_size,
block_size=block_size)
seq_b, seq_group_b = create_dummy_prompt("2",
block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group_a) scheduler.add_seq_group(seq_group_a)
scheduler.add_seq_group(seq_group_b) scheduler.add_seq_group(seq_group_b)
...@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort(): ...@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
assert scheduler.get_num_unfinished_seq_groups() == 1 assert scheduler.get_num_unfinished_seq_groups() == 1
def test_scheduler_max_seqs(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_max_seqs(use_v2_block_manager: bool):
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_seq_group = 2 max_seq_group = 2
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len) scheduler_config = SchedulerConfig(
64,
max_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -184,7 +213,9 @@ def test_scheduler_max_seqs(): ...@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
all_seq_groups: List[SequenceGroup] = [] all_seq_groups: List[SequenceGroup] = []
# Add seq groups to scheduler. # Add seq groups to scheduler.
for i in range(num_seq_group): for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) _, seq_group = create_dummy_prompt(str(i),
prompt_length=block_size,
block_size=block_size)
all_seq_groups.append(seq_group) all_seq_groups.append(seq_group)
# Append 1 seq group # Append 1 seq group
...@@ -211,9 +242,15 @@ def test_scheduler_max_seqs(): ...@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
def test_scheduler_delay_factor(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_delay_factor(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5) scheduler_config = SchedulerConfig(
100,
64,
16,
delay_factor=0.5,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -221,7 +258,8 @@ def test_scheduler_delay_factor(): ...@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
# schedule first prompt # schedule first prompt
seq_group_meta, seq_group = create_dummy_prompt("0", seq_group_meta, seq_group = create_dummy_prompt("0",
prompt_length=block_size) prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
assert out.num_prefill_groups > 0 assert out.num_prefill_groups > 0
...@@ -231,7 +269,8 @@ def test_scheduler_delay_factor(): ...@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
# wait for a second before scheduling next prompt # wait for a second before scheduling next prompt
time.sleep(1) time.sleep(1)
seq_group_meta, seq_group = create_dummy_prompt("1", seq_group_meta, seq_group = create_dummy_prompt("1",
prompt_length=block_size) prompt_length=block_size,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
# second prompt should *not* be scheduled # second prompt should *not* be scheduled
...@@ -248,11 +287,20 @@ def test_scheduler_delay_factor(): ...@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
append_new_token(out, 1) append_new_token(out, 1)
def test_swapped_out_prioritized(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler(max_num_seqs=6) def test_swapped_out_prioritized(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(max_num_seqs=6,
block_size=block_size,
use_v2_block_manager=use_v2_block_manager,
num_cpu_blocks=64,
num_gpu_blocks=64)
# best_of=2 * 3 == 6 sequences. # best_of=2 * 3 == 6 sequences.
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
# prefill scheduled now. # prefill scheduled now.
...@@ -276,7 +324,10 @@ def test_swapped_out_prioritized(): ...@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
append_new_token(out, 1) append_new_token(out, 1)
# Add 1 more task. Swap should be prioritized over prefill. # Add 1 more task. Swap should be prioritized over prefill.
_, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
append_new_token(out, 1) append_new_token(out, 1)
...@@ -287,17 +338,26 @@ def test_swapped_out_prioritized(): ...@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
assert out.blocks_to_swap_out == [] assert out.blocks_to_swap_out == []
def initialize_scheduler(*, def initialize_scheduler(
max_num_seqs=1000, *,
max_token_budget=1000, max_num_seqs=1000,
max_model_len=1000, max_token_budget=1000,
lora_config=None): max_model_len=1000,
block_size = 4 lora_config=None,
scheduler_config = SchedulerConfig(max_token_budget, max_num_seqs, use_v2_block_manager=False,
max_model_len) block_size=4,
num_cpu_blocks=8,
num_gpu_blocks=8,
):
block_size = block_size
scheduler_config = SchedulerConfig(
max_token_budget,
max_num_seqs,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = num_gpu_blocks
scheduler = Scheduler(scheduler_config, cache_config, lora_config) scheduler = Scheduler(scheduler_config, cache_config, lora_config)
return scheduler return scheduler
...@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget, ...@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs) budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
def test_prefill_schedule_max_prompt_len(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
""" """
Test prompt longer than max_prompt_len is aborted. Test prompt longer than max_prompt_len is aborted.
""" """
scheduler = initialize_scheduler(max_model_len=30) block_size = 4
_, seq_group = create_dummy_prompt("0", prompt_length=60) scheduler = initialize_scheduler(max_model_len=30,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
_, seq_group = create_dummy_prompt("0",
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
budget = create_token_budget() budget = create_token_budget()
output = scheduler._schedule_prefills(budget, None) output = scheduler._schedule_prefills(budget, None)
...@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len(): ...@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
def test_prefill_schedule_token_budget(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
""" """
Test token budget respected. Test token budget respected.
""" """
scheduler = initialize_scheduler() block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=0) budget = create_token_budget(token_budget=0)
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
# 0 token budget == nothing is scheduled. # 0 token budget == nothing is scheduled.
...@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget(): ...@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
assert len(remaining_waiting) == 1 assert len(remaining_waiting) == 1
# Test when current_batched_tokens respected. # Test when current_batched_tokens respected.
scheduler = initialize_scheduler() scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=16,
num_gpu_blocks=16)
budget = create_token_budget(token_budget=60) budget = create_token_budget(token_budget=60)
add_token_budget(budget, 30, 0) add_token_budget(budget, 30, 0)
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
# Cannot schedule a prompt that doesn't fit the budget. # Cannot schedule a prompt that doesn't fit the budget.
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None) output = scheduler._schedule_prefills(budget, None)
...@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget(): ...@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
def test_prefill_schedule_max_seqs(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
""" """
Test max seq respected. Test max seq respected.
""" """
scheduler = initialize_scheduler() block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(max_num_seqs=2) budget = create_token_budget(max_num_seqs=2)
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None) output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting remaining_waiting = scheduler.waiting
...@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs(): ...@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
scheduler.waiting = deque() scheduler.waiting = deque()
budget = create_token_budget(max_num_seqs=2) budget = create_token_budget(max_num_seqs=2)
add_token_budget(budget, 0, 2) add_token_budget(budget, 0, 2)
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
output = scheduler._schedule_prefills(budget, None) output = scheduler._schedule_prefills(budget, None)
remaining_waiting = scheduler.waiting remaining_waiting = scheduler.waiting
...@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs(): ...@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
assert len(remaining_waiting) == 1 assert len(remaining_waiting) == 1
def test_prefill_schedule_max_lora(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
""" """
Test max lora is respected and prioritized. Test max lora is respected and prioritized.
""" """
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config) scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=120) budget = create_token_budget(token_budget=120)
curr_loras: Set[int] = set() curr_loras: Set[int] = set()
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), _, seq_group = create_dummy_prompt(str(i),
prompt_length=60, prompt_length=60,
block_size=block_size,
lora_request=LoRARequest( lora_request=LoRARequest(
lora_name=str(i), lora_name=str(i),
lora_int_id=i + 1, lora_int_id=i + 1,
...@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora(): ...@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
# If a request is not scheduled because it hits max lora, it is # If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that. # prioritized. Verify that.
for i in range(2, 4): for i in range(2, 4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
# Schedule 2 requests (0 and 2) # Schedule 2 requests (0 and 2)
output = scheduler._schedule_prefills(budget, curr_loras) output = scheduler._schedule_prefills(budget, curr_loras)
...@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora(): ...@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
assert budget.num_batched_tokens == 60 assert budget.num_batched_tokens == 60
def test_prefill_schedule_no_block_manager_capacity(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
""" """
Test sequence cannot be scheduled due to block manager has no capacity. Test sequence cannot be scheduled due to block manager has no capacity.
""" """
scheduler = initialize_scheduler() block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_gpu_blocks=128,
num_cpu_blocks=128)
budget = create_token_budget() budget = create_token_budget()
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
scheduler.block_manager.can_allocate = MagicMock() scheduler.block_manager.can_allocate = MagicMock()
scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER
...@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity(): ...@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
scheduler = initialize_scheduler() scheduler = initialize_scheduler()
budget = create_token_budget() budget = create_token_budget()
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler.add_seq_group(seq_group) scheduler.add_seq_group(seq_group)
scheduler.block_manager.can_allocate = MagicMock() scheduler.block_manager.can_allocate = MagicMock()
scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER
...@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity(): ...@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
def test_decode_schedule_preempted(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_schedule_preempted(use_v2_block_manager: bool):
""" """
Test decodes cannot be scheduled and preempted. Test decodes cannot be scheduled and preempted.
""" """
scheduler = initialize_scheduler() block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None curr_loras = None
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
scheduler._add_seq_group_to_running(seq_group) scheduler._add_seq_group_to_running(seq_group)
...@@ -541,15 +653,23 @@ def test_decode_schedule_preempted(): ...@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
assert output.blocks_to_copy == [] assert output.blocks_to_copy == []
def test_decode_swap_beam_search(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_swap_beam_search(use_v2_block_manager: bool):
""" """
Test best_of > 1 swap out blocks Test best_of > 1 swap out blocks
""" """
scheduler = initialize_scheduler() block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_gpu_blocks=64,
num_cpu_blocks=64)
curr_loras = None curr_loras = None
budget = create_token_budget() budget = create_token_budget()
for i in range(3): for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
scheduler._add_seq_group_to_running(seq_group) scheduler._add_seq_group_to_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
...@@ -589,12 +709,20 @@ def test_decode_swap_beam_search(): ...@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
assert output.blocks_to_copy == [] assert output.blocks_to_copy == []
def test_schedule_decode_blocks_to_copy_update(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
""" """
Verify blocks_to_copy is updated. Verify blocks_to_copy is updated.
""" """
scheduler = initialize_scheduler() block_size = 4
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=4,
num_cpu_blocks=16,
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
curr_loras = None curr_loras = None
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
...@@ -619,13 +747,19 @@ def test_schedule_decode_blocks_to_copy_update(): ...@@ -619,13 +747,19 @@ def test_schedule_decode_blocks_to_copy_update():
assert output.blocks_to_copy == [(2, 3)] assert output.blocks_to_copy == [(2, 3)]
def test_schedule_swapped_simple(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_schedule_swapped_simple(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt("1",
prompt_length=4,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(4, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._swap_out(seq_group, blocks_to_swap_out)
scheduler._add_seq_group_to_swapped(seq_group) scheduler._add_seq_group_to_swapped(seq_group)
...@@ -644,12 +778,17 @@ def test_schedule_swapped_simple(): ...@@ -644,12 +778,17 @@ def test_schedule_swapped_simple():
assert blocks_to_swap_out == blocks_to_swap_in_reverse assert blocks_to_swap_out == blocks_to_swap_in_reverse
def test_schedule_swapped_max_token_budget(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
for _ in range(2): for i in range(2):
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._swap_out(seq_group, blocks_to_swap_out)
...@@ -676,12 +815,19 @@ def test_schedule_swapped_max_token_budget(): ...@@ -676,12 +815,19 @@ def test_schedule_swapped_max_token_budget():
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_max_seqs(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
for i in range(4): for i in range(4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
block_size=4)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._swap_out(seq_group, blocks_to_swap_out)
...@@ -706,14 +852,21 @@ def test_schedule_swapped_max_seqs(): ...@@ -706,14 +852,21 @@ def test_schedule_swapped_max_seqs():
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_max_loras(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config) scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras: Set[int] = set() curr_loras: Set[int] = set()
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
for i in range(2): for i in range(2):
_, seq_group = create_dummy_prompt(str(i), _, seq_group = create_dummy_prompt(str(i),
prompt_length=60, prompt_length=60,
block_size=block_size,
lora_request=LoRARequest( lora_request=LoRARequest(
lora_name=str(i), lora_name=str(i),
lora_int_id=i + 1, lora_int_id=i + 1,
...@@ -734,12 +887,20 @@ def test_schedule_swapped_max_loras(): ...@@ -734,12 +887,20 @@ def test_schedule_swapped_max_loras():
assert len(curr_loras) == 1 assert len(curr_loras) == 1
def test_schedule_swapped_cannot_swap_in(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
for _ in range(2): for i in range(2):
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._swap_out(seq_group, blocks_to_swap_out)
...@@ -759,12 +920,20 @@ def test_schedule_swapped_cannot_swap_in(): ...@@ -759,12 +920,20 @@ def test_schedule_swapped_cannot_swap_in():
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
def test_infeasible_swap(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_infeasible_swap(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
for _ in range(2): for i in range(2):
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt(str(i),
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._swap_out(seq_group, blocks_to_swap_out)
...@@ -785,10 +954,18 @@ def test_infeasible_swap(): ...@@ -785,10 +954,18 @@ def test_infeasible_swap():
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
def test_schedule_swapped_blocks_to_copy(): @pytest.mark.parametrize('use_v2_block_manager', [True, False])
scheduler = initialize_scheduler() def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None curr_loras = None
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) _, seq_group = create_dummy_prompt("1",
prompt_length=60,
best_of=2,
block_size=block_size)
scheduler._allocate_and_set_running(seq_group) scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1) append_new_token_seq_group(60, seq_group, 1)
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
......
...@@ -8,6 +8,8 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node ...@@ -8,6 +8,8 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import os import os
import pytest import pytest
from packaging import version
from transformers import __version__ as transformers_version
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -37,6 +39,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" ...@@ -37,6 +39,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"), (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"), (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"), (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
...@@ -46,6 +49,11 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, ...@@ -46,6 +49,11 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
pytest.skip("Skipping multi-node pipeline parallel test for " pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend") "multiprocessing distributed backend")
# Skip tests that require transformers>=4.45.0
if "Qwen2-VL" in MODEL_NAME and version.parse(
transformers_version) < version.parse("4.45.0.dev0"):
pytest.skip("This test requires transformers>=4.45.0")
pp_args = [ pp_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
......
"""E2E tests to verify the correctness of the encoder-decoder framework
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from typing import List, Optional, Tuple
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
def vllm_to_hf_output(
    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Rewrite a vLLM output tuple so its text lines up with HF's.

    The token ids and logprobs are passed through untouched. The output
    text always gets a trailing ``</s>``; when ``decoder_prompt_type`` is
    ``DecoderPromptType.NONE`` it additionally gets a leading ``<s>``.

    Returns:
        A ``(token_ids, text, logprobs)`` tuple with the adjusted text.
    """
    token_ids, text, logprobs = vllm_output

    bos_prefix = "<s>" if decoder_prompt_type == DecoderPromptType.NONE else ""
    return token_ids, bos_prefix + text + "</s>", logprobs
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
    is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    decoder_prompt_type: DecoderPromptType,
    enforce_eager: bool,
) -> None:
    """End-to-end check of the encoder/decoder path against Hugging Face.

    The same prompts are run greedily through the Hugging Face model and
    through vLLM, and the two sets of outputs (token ids, text and
    logprobs) are required to be close according to
    ``check_logprobs_close``.
    """
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Generation settings handed to the HF baseline
    hf_generate_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    # Reference run: Hugging Face
    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts, max_tokens, num_logprobs, **hf_generate_kwargs)

    # Run under test: vLLM
    with vllm_runner(model, dtype=dtype,
                     enforce_eager=enforce_eager) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # One HF token is skipped in the comparison only when no decoder prompt
    # was supplied.
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_skip_tokens = 1
    else:
        hf_skip_tokens = 0

    sanitized_vllm_outputs = [
        vllm_to_hf_output(single_output, decoder_prompt_type)
        for single_output in vllm_outputs
    ]

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=sanitized_vllm_outputs,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )
from argparse import ArgumentTypeError
import pytest import pytest
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs, nullable_kvs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
...@@ -13,6 +15,10 @@ from vllm.utils import FlexibleArgumentParser ...@@ -13,6 +15,10 @@ from vllm.utils import FlexibleArgumentParser
"image": 16, "image": 16,
"video": 2 "video": 2
}), }),
("Image=16, Video=2", {
"image": 16,
"video": 2
}),
]) ])
def test_limit_mm_per_prompt_parser(arg, expected): def test_limit_mm_per_prompt_parser(arg, expected):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
...@@ -22,3 +28,36 @@ def test_limit_mm_per_prompt_parser(arg, expected): ...@@ -22,3 +28,36 @@ def test_limit_mm_per_prompt_parser(arg, expected):
args = parser.parse_args(["--limit-mm-per-prompt", arg]) args = parser.parse_args(["--limit-mm-per-prompt", arg])
assert args.limit_mm_per_prompt == expected assert args.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
    "arg",
    [
        # Missing =
        "image",
        # Conflicting values
        "image=4,image=5",
        # Too many = in tokenized arg
        "image=video=4",
    ])
def test_bad_nullable_kvs(arg):
    """Malformed key=value strings must be rejected by ``nullable_kvs``."""
    with pytest.raises(ArgumentTypeError):
        nullable_kvs(arg)
@pytest.mark.parametrize(("arg", "expected"), [
    (None, None),
    ("{}", {}),
    ('{"num_crops": 4}', {
        "num_crops": 4
    }),
    ('{"foo": {"bar": "baz"}}', {
        "foo": {
            "bar": "baz"
        }
    }),
])
def test_mm_processor_kwargs_prompt_parser(arg, expected):
    """Check how ``--mm-processor-kwargs`` is parsed from the command line.

    ``arg`` is the raw JSON string given to the flag; ``None`` means the
    flag is left off the command line altogether.
    """
    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    argv = [] if arg is None else ["--mm-processor-kwargs", arg]
    parsed = cli_parser.parse_args(argv)

    assert parsed.mm_processor_kwargs == expected
...@@ -162,6 +162,41 @@ def test_chat(): ...@@ -162,6 +162,41 @@ def test_chat():
assert len(outputs) == 1 assert len(outputs) == 1
def test_multi_chat():
    """Submit two conversations in a single ``LLM.chat`` call.

    One output is expected back per conversation.
    """
    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation has the same system message and differs only in
    # the user turn.
    messages = [[
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": user_prompt
        },
    ] for user_prompt in user_prompts]

    outputs = llm.chat(messages)

    assert len(outputs) == 2
@pytest.mark.parametrize("image_urls", @pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]): def test_chat_multi_image(image_urls: List[str]):
......
import asyncio
import tempfile
import unittest
import unittest.mock
import uuid
import pytest
import pytest_asyncio
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.rpc.client import (AsyncEngineRPCClient,
RPCClientClosedError)
from vllm.entrypoints.openai.rpc.server import AsyncEngineRPCServer
@pytest.fixture(scope="function")
def tmp_socket():
    """Yield an ``ipc://`` address located inside a temporary directory.

    The directory is removed again once the requesting test has finished.
    """
    with tempfile.TemporaryDirectory() as socket_dir:
        socket_name = uuid.uuid4()
        yield f"ipc://{socket_dir}/{socket_name}"
@pytest_asyncio.fixture(scope="function")
async def dummy_server(tmp_socket, monkeypatch):
    """Yield an ``AsyncEngineRPCServer`` backed by a mocked engine.

    ``AsyncLLMEngine.from_engine_args`` is patched while the server is
    constructed so that it hands back an ``AsyncMock``. The server loop is
    then scheduled on the running event loop for the duration of the test.
    """
    mock_engine = unittest.mock.AsyncMock()

    with monkeypatch.context() as patcher:
        # Whatever arguments the server passes, it receives the mock engine.
        patcher.setattr(AsyncLLMEngine, "from_engine_args",
                        lambda *args, **kwargs: mock_engine)
        server = AsyncEngineRPCServer(None, None, rpc_path=tmp_socket)

    server_task = asyncio.get_running_loop().create_task(
        server.run_server_loop())

    try:
        yield server
    finally:
        # Stop the background loop first, then let the server clean up.
        server_task.cancel()
        server.cleanup()
@pytest_asyncio.fixture(scope="function")
async def client(tmp_socket):
    """Yield an ``AsyncEngineRPCClient`` pointed at ``tmp_socket``.

    The client is closed once the requesting test has finished.
    """
    rpc_client = AsyncEngineRPCClient(rpc_path=tmp_socket)

    # Sanity check: the server is connected
    await rpc_client._wait_for_server_rpc()

    try:
        yield rpc_client
    finally:
        rpc_client.close()
@pytest.mark.asyncio
async def test_client_data_methods_use_timeouts(monkeypatch, dummy_server,
                                                client: AsyncEngineRPCClient):
    """``client.setup()`` must time out when the server never answers."""
    with monkeypatch.context() as patcher:
        # Make the server _not_ reply with a model config
        patcher.setattr(dummy_server, "get_config", lambda x: None)
        patcher.setattr(client, "_data_timeout", 10)

        # client.setup() invokes server.get_config(), which now stays
        # silent; the task must still finish, by raising the client's own
        # timeout error.
        running_loop = asyncio.get_running_loop()
        setup_task = running_loop.create_task(client.setup())

        with pytest.raises(TimeoutError, match="Server didn't reply within"):
            await asyncio.wait_for(setup_task, timeout=0.05)
@pytest.mark.asyncio
async def test_client_aborts_use_timeouts(monkeypatch, dummy_server,
                                          client: AsyncEngineRPCClient):
    """``client.abort()`` must return normally even if the server hangs."""
    with monkeypatch.context() as patcher:
        # Hang all abort requests
        patcher.setattr(dummy_server, "abort", lambda x: None)
        patcher.setattr(client, "_data_timeout", 10)

        # The client should suppress timeouts on `abort`s
        # and return normally, assuming the server will eventually
        # abort the request.
        running_loop = asyncio.get_running_loop()
        abort_task = running_loop.create_task(
            client.abort("test request id"))

        await asyncio.wait_for(abort_task, timeout=0.05)
@pytest.mark.asyncio
async def test_client_data_methods_reraise_exceptions(
        monkeypatch, dummy_server, client: AsyncEngineRPCClient):
    """An exception raised by the server's engine must reach the client."""
    with monkeypatch.context() as patcher:
        # Make the server raise some random exception
        expected_error = RuntimeError("Client test exception")

        def raise_expected_error():
            raise expected_error

        patcher.setattr(dummy_server.engine, "get_model_config",
                        raise_expected_error)
        patcher.setattr(client, "_data_timeout", 10)

        running_loop = asyncio.get_running_loop()
        setup_task = running_loop.create_task(client.setup())

        # And ensure the task completes, raising the exception
        with pytest.raises(RuntimeError, match=str(expected_error)):
            await asyncio.wait_for(setup_task, timeout=0.05)
@pytest.mark.asyncio
async def test_client_errors_after_closing(monkeypatch, dummy_server,
                                           client: AsyncEngineRPCClient):
    """Check which client calls fail and which succeed once it is closed."""

    async def consume(stream):
        # Exhaust an async iterator, discarding whatever it yields.
        async for _ in stream:
            pass

    client.close()

    # Healthchecks and generate requests will fail with explicit errors
    with pytest.raises(RPCClientClosedError):
        await client.check_health()

    with pytest.raises(RPCClientClosedError):
        await consume(client.generate(None, None, None))

    # But no-ops like aborting will pass
    await client.abort("test-request-id")
    await client.do_log_stats()
...@@ -18,38 +18,36 @@ TASK = "gsm8k" ...@@ -18,38 +18,36 @@ TASK = "gsm8k"
FILTER = "exact_match,strict-match" FILTER = "exact_match,strict-match"
RTOL = 0.03 RTOL = 0.03
EXPECTED_VALUE = 0.58 EXPECTED_VALUE = 0.58
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
MORE_ARGS_LIST = [
["--enable-chunked-prefill"], # Chunked
["--num-scheduler-steps", "8"], # MS
["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream
]
@pytest.fixture(scope="module") @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def server(): def test_lm_eval_accuracy(more_args):
args = [ args = list(DEFAULT_ARGS)
"--max-model-len", "4096", "--enable-chunked-prefill", args.extend(more_args)
"--disable-log-requests", "--enforce-eager"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def server_data(server):
return {
"url": f"{server.url_for('v1')}/completions",
}
print(f"Running with: {args}")
def test_lm_eval_accuracy(server_data): with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
model_args = (f"model={MODEL_NAME}," url = f"{remote_server.url_for('v1')}/completions"
f"base_url={server_data['url']},"
f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False") model_args = (
f"model={MODEL_NAME},"
results = lm_eval.simple_evaluate( f"base_url={url},"
model="local-completions", f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
model_args=model_args,
tasks=TASK, results = lm_eval.simple_evaluate(
) model="local-completions",
model_args=model_args,
measured_value = results["results"][TASK][FILTER] tasks=TASK,
assert (measured_value - RTOL < EXPECTED_VALUE )
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
...@@ -5,7 +5,7 @@ from vllm.entrypoints.chat_utils import (apply_hf_chat_template, ...@@ -5,7 +5,7 @@ from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import VLLM_PATH from ...utils import VLLM_PATH
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists() assert chatml_jinja_path.exists()
......
import json
import unittest
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser
# Sample LoRA module description; the tests below serialize it with
# json.dumps() and pass it to --lora-modules as the JSON-format input.
LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama"
}
class TestLoraParserAction(unittest.TestCase):
    """Exercise the ``--lora-modules`` option of the OpenAI server parser.

    Covers the ``name=path`` form, the JSON form, a mix of the two, and
    several malformed inputs that the parser is expected to reject.
    """

    def setUp(self):
        # Build a fresh server argument parser for every test
        self.parser = make_arg_parser(
            FlexibleArgumentParser(
                description="vLLM's remote OpenAI server."))

    def _parse_lora_modules(self, *values):
        # Parse ``--lora-modules <values...>`` and return the parsed modules.
        return self.parser.parse_args(['--lora-modules',
                                       *values]).lora_modules

    def _assert_rejected(self, value):
        # The parser must exit (SystemExit) when given ``value``.
        with self.assertRaises(SystemExit):
            self.parser.parse_args(['--lora-modules', value])

    def test_valid_key_value_format(self):
        # Old format: name=path
        parsed = self._parse_lora_modules('module1=/path/to/module1')
        self.assertEqual(
            parsed,
            [LoRAModulePath(name='module1', path='/path/to/module1')])

    def test_valid_json_format(self):
        # Well-formed JSON input
        parsed = self._parse_lora_modules(json.dumps(LORA_MODULE))
        self.assertEqual(parsed, [
            LoRAModulePath(name='module2',
                           path='/path/to/module2',
                           base_model_name='llama')
        ])

    def test_invalid_json_format(self):
        # Malformed JSON: the closing brace is missing
        self._assert_rejected(
            '{"name": "module3", "path": "/path/to/module3"')

    def test_invalid_type_error(self):
        # Neither JSON nor key=value
        self._assert_rejected('invalid_format')

    def test_invalid_json_field(self):
        # Well-formed JSON, but the required 'path' field is absent
        self._assert_rejected('{"name": "module4"}')

    def test_empty_values(self):
        # No LoRA modules provided
        self.assertEqual(self._parse_lora_modules(''), [])

    def test_multiple_valid_inputs(self):
        # Old format and JSON format side by side
        parsed = self._parse_lora_modules('module1=/path/to/module1',
                                          json.dumps(LORA_MODULE))
        self.assertEqual(parsed, [
            LoRAModulePath(name='module1', path='/path/to/module1'),
            LoRAModulePath(name='module2',
                           path='/path/to/module2',
                           base_model_name='llama')
        ])
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
import json
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download the LoRA repository and return what snapshot_download gives.

    Module-scoped, so the download is requested once per test module.
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    """Yield a remote OpenAI server started with two JSON-format LoRAs.

    Both LoRA modules point at the same downloaded files and name
    ``MODEL_NAME`` as their base model; only their names differ.
    """
    # JSON-format LoRA module configurations, one per served LoRA name
    lora_modules_json = [
        json.dumps({
            "name": lora_name,
            "path": zephyr_lora_files,
            "base_model_name": MODEL_NAME
        }) for lora_name in ("zephyr-lora", "zephyr-lora2")
    ]

    # use half precision for speed and memory savings in CI environment
    base_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        *lora_modules_json,
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME,
                            base_args + lora_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
    """Yield an async client obtained from the JSON-LoRA server fixture."""
    lineage_server = server_with_lora_modules_json
    async with lineage_server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
                                  zephyr_lora_files):
    """Verify ``id``/``root``/``parent`` of the models the server lists.

    The first listed model is expected to be the base model; every model
    after it is expected to be a LoRA whose root is the downloaded LoRA
    path and whose parent is the base model.
    """
    listing = await client_for_lora_lineage.models.list()
    model_cards = listing.data
    base_card = model_cards[0]
    lora_cards = model_cards[1:]

    # Base model: its own root, and no parent
    assert base_card.id == MODEL_NAME
    assert base_card.root == MODEL_NAME
    assert base_card.parent is None

    # LoRA models: rooted at the LoRA files, children of the base model
    assert all(card.root == zephyr_lora_files for card in lora_cards)
    assert all(card.parent == MODEL_NAME for card in lora_cards)

    assert lora_cards[0].id == "zephyr-lora"
    assert lora_cards[1].id == "zephyr-lora2"
...@@ -51,12 +51,14 @@ async def client(server): ...@@ -51,12 +51,14 @@ async def client(server):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI): async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
models = await client.models.list() models = await client.models.list()
models = models.data models = models.data
served_model = models[0] served_model = models[0]
lora_models = models[1:] lora_models = models[1:]
assert served_model.id == MODEL_NAME assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models) assert served_model.root == MODEL_NAME
assert all(lora_model.root == zephyr_lora_files
for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora" assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2" assert lora_models[1].id == "zephyr-lora2"
import time
import pytest
from vllm.entrypoints.openai.api_server import build_async_engine_client
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.utils import FlexibleArgumentParser
@pytest.mark.asyncio
async def test_mp_crash_detection():
    """Entering and leaving the engine client must finish within 60s.

    The engine is configured with an invalid ``tensor_parallel_size`` to
    provoke an error during startup.
    """
    cli_parser = make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
    args = cli_parser.parse_args([])

    # use an invalid tensor_parallel_size to trigger the
    # error in the server
    args.tensor_parallel_size = 65536

    started_at = time.perf_counter()
    async with build_async_engine_client(args):
        pass
    finished_at = time.perf_counter()

    assert finished_at - started_at < 60, (
        "Expected vLLM to gracefully shutdown in <60s "
        "if there is an error in the startup.")
@pytest.mark.asyncio
async def test_mp_cuda_init():
    """Building the engine client must work after CUDA is initialized.

    CUDA is initialized in this (API server) process before the engine
    client is built; the test passes if nothing raises.
    """
    # it should not crash, when cuda is initialized
    # in the API server process
    import torch
    torch.cuda.init()

    cli_parser = make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
    args = cli_parser.parse_args([])

    async with build_async_engine_client(args):
        pass
...@@ -4,13 +4,15 @@ from dataclasses import dataclass ...@@ -4,13 +4,15 @@ from dataclasses import dataclass
from unittest.mock import MagicMock from unittest.mock import MagicMock
from vllm.config import MultiModalConfig from vllm.config import MultiModalConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import BaseModelPath
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass @dataclass
...@@ -37,7 +39,7 @@ async def _async_serving_chat_init(): ...@@ -37,7 +39,7 @@ async def _async_serving_chat_init():
serving_completion = OpenAIServingChat(engine, serving_completion = OpenAIServingChat(engine,
model_config, model_config,
served_model_names=[MODEL_NAME], BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
lora_modules=None, lora_modules=None,
...@@ -52,12 +54,13 @@ def test_async_serving_chat_init(): ...@@ -52,12 +54,13 @@ def test_async_serving_chat_init():
def test_serving_chat_should_set_correct_max_tokens(): def test_serving_chat_should_set_correct_max_tokens():
mock_engine = MagicMock(spec=AsyncLLMEngine) mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = OpenAIServingChat(mock_engine, serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(), MockModelConfig(),
served_model_names=[MODEL_NAME], BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
lora_modules=None, lora_modules=None,
......
...@@ -4,13 +4,14 @@ from unittest.mock import MagicMock ...@@ -4,13 +4,14 @@ from unittest.mock import MagicMock
import pytest import pytest
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import AsyncEngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest, LoadLoraAdapterRequest,
UnloadLoraAdapterRequest) UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
MODEL_NAME = "meta-llama/Llama-2-7b" MODEL_NAME = "meta-llama/Llama-2-7b"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = ( LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.") "Success: LoRA adapter '{lora_name}' added successfully.")
LORA_UNLOADING_SUCCESS_MESSAGE = ( LORA_UNLOADING_SUCCESS_MESSAGE = (
...@@ -18,14 +19,14 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( ...@@ -18,14 +19,14 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
async def _async_serving_engine_init(): async def _async_serving_engine_init():
mock_engine_client = MagicMock(spec=AsyncEngineClient) mock_engine_client = MagicMock(spec=EngineClient)
mock_model_config = MagicMock(spec=ModelConfig) mock_model_config = MagicMock(spec=ModelConfig)
# Set the max_model_len attribute to avoid missing attribute # Set the max_model_len attribute to avoid missing attribute
mock_model_config.max_model_len = 2048 mock_model_config.max_model_len = 2048
serving_engine = OpenAIServing(mock_engine_client, serving_engine = OpenAIServing(mock_engine_client,
mock_model_config, mock_model_config,
served_model_names=[MODEL_NAME], BASE_MODEL_PATHS,
lora_modules=None, lora_modules=None,
prompt_adapters=None, prompt_adapters=None,
request_logger=None) request_logger=None)
......
...@@ -44,5 +44,5 @@ async def test_shutdown_on_engine_failure(tmp_path): ...@@ -44,5 +44,5 @@ async def test_shutdown_on_engine_failure(tmp_path):
prompt="Hello, my name is") prompt="Hello, my name is")
# Now the server should shut down # Now the server should shut down
return_code = remote_server.proc.wait(timeout=3) return_code = remote_server.proc.wait(timeout=8)
assert return_code is not None assert return_code is not None
...@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck ...@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, QuickGELU, NewGELU, QuickGELU,
SiluAndMul) SiluAndMul)
from vllm.utils import seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
...@@ -34,9 +35,7 @@ def test_act_and_mul( ...@@ -34,9 +35,7 @@ def test_act_and_mul(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype) x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu": if activation == "silu":
...@@ -77,9 +76,7 @@ def test_activation( ...@@ -77,9 +76,7 @@ def test_activation(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype) x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation[0]() layer = activation[0]()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment