Commit 2216a4e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/main'

parents ad385667 51c24c97
from typing import List, Type
import pytest
import torch.nn.functional as F
from ....conftest import IMAGE_ASSETS
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ..utils import check_embeddings_close
HF_TEXT_PROMPTS = [
# T -> X
"Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501
# T -> X
"Retrieve an image of this caption: cherry blossom",
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# T + I -> X
"stop_sign":
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
# I -> X
"cherry_blossom":
"<|image_1|> Represent the given image with the following question: What is in the image", # noqa: E501
"<|image_1|> Represent the given image for classification", # noqa: E501
})
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
max_model_len=4096,
max_num_seqs=2,
dtype=dtype,
with vllm_runner(model, task="embedding", dtype=dtype,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype) as hf_model:
all_inputs = hf_model.get_inputs(example_prompts)
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
for inputs in all_inputs:
......@@ -60,3 +72,53 @@ def test_models(
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Run the shared `_run_test` check on the text-only prompts."""
    # Text-only prompts have no image, so every prompt is paired with None.
    prompts = list(HF_TEXT_PROMPTS)
    no_images = [None] * len(prompts)

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        no_images,  # type: ignore
        model,
        dtype=dtype,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Run the shared `_run_test` check on the image prompts."""
    # Pair each image prompt with the PIL image of the matching asset.
    prompts = []
    images = []
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        prompts.append(prompt)
        images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        images,
        model,
        dtype=dtype,
    )
......@@ -3,10 +3,10 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import torch
from vllm.config import ModelConfig
from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from vllm.utils import is_cpu
TokensText = Tuple[List[int], str]
......@@ -19,7 +19,7 @@ def check_outputs_equal(
name_1: str,
):
"""
Compare the two sequences generated by different models,
Compare the two sequences generated by different models,
which should be equal.
"""
assert len(outputs_0_lst) == len(outputs_1_lst)
......@@ -248,13 +248,14 @@ def check_logprobs_close(
def build_model_context(model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None):
"""Creates an InputContext for a given model.
Args:
model_name: Name of the model being considered.
tokenizer_name: Name of the tokenizer being considered.
......@@ -269,11 +270,12 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if is_cpu() else "half"
dtype = "bfloat16" if current_platform.is_cpu() else "half"
model_config = ModelConfig(
model_name,
tokenizer_name,
task=task,
tokenizer=tokenizer_name,
tokenizer_mode="auto",
trust_remote_code=trust_remote_code,
dtype=dtype,
......
......@@ -59,15 +59,7 @@ async def test_evil_forward(tmp_socket):
await asyncio.sleep(2.0)
await client.check_health()
# Throws an error in first forward pass.
with pytest.raises(RAISED_ERROR):
async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(),
request_id=uuid.uuid4()):
pass
assert client.errored
# Engine is errored, should get ENGINE_DEAD_ERROR.
# Throws an error that should get ENGINE_DEAD_ERROR.
with pytest.raises(MQEngineDeadError):
async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(),
......@@ -149,7 +141,7 @@ async def test_failed_abort(tmp_socket):
client = await engine.make_client()
assert client.is_running
# Firsh check health should work.
# First check health should work.
await client.check_health()
# Trigger an abort on the client side.
......@@ -174,6 +166,45 @@ async def test_failed_abort(tmp_socket):
client.close()
@pytest.mark.asyncio
async def test_batch_error(tmp_socket):
    """Every in-flight request must fail with MQEngineDeadError."""
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_abort) as engine:

        client = await engine.make_client()
        assert client.is_running

        # The first health check is expected to succeed.
        await client.check_health()

        async def do_generate(client):
            # min_tokens=2048 keeps the engine busy long enough for the
            # request that crashes the engine to be processed.
            long_params = SamplingParams(min_tokens=2048, max_tokens=2048)
            stream = client.generate(prompt="Hello my name is",
                                     sampling_params=long_params,
                                     request_id=uuid.uuid4())
            async for _ in stream:
                pass

        # Launch a batch of concurrent requests.
        tasks = []
        for _ in range(10):
            tasks.append(asyncio.create_task(do_generate(client)))

        # This abort forces the batch being processed to raise an
        # exception, which then leaves the engine in an errored state.
        await client.abort(request_id="foo")

        # Every request in the failed batch should surface the same
        # exception, wrapped as an MQEngineDeadError.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
        for outcome in outcomes:
            assert isinstance(outcome, MQEngineDeadError)
            assert "KeyError" in repr(outcome)

        client.close()
@pytest.mark.asyncio
async def test_bad_request(tmp_socket):
with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
......
......@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
......@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
......@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
......@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
......
......@@ -221,6 +221,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
ctx = build_model_context(MULTIMODAL_MODEL_ID,
task="generate",
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt={"image": 1})
......@@ -256,6 +257,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
"""Ensure that max token calcs filters out invalid mm_processor_kwargs"""
ctx = build_model_context(MULTIMODAL_MODEL_ID,
task="generate",
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt={"image": 1})
......@@ -278,12 +280,13 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
### Test overrides for the mapper
@pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE])
def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
"""Ensure that the mapper processor kwargs can fall back to HF models."""
# NOTE - we don't validate bad inputs for the default mapper, because it's
# through the automodel interface in transformers, so we can't easily
# inspect what kwargs are or are not allowed.
ctx = build_model_context(MULTIMODAL_MODEL_ID,
task="generate",
trust_remote_code=True,
mm_processor_kwargs={"num_crops": num_crops},
limit_mm_per_prompt={"image": 1})
......@@ -311,6 +314,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
init_num_crops, inference_num_crops)
ctx = build_model_context(MULTIMODAL_MODEL_ID,
task="generate",
trust_remote_code=True,
mm_processor_kwargs=init_kwargs,
limit_mm_per_prompt={"image": 1})
......@@ -348,6 +352,7 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
"""Ensure that custom mappers filters out invalid mm_processor_kwargs"""
# Should filter out the init time kwargs
ctx = build_model_context(MULTIMODAL_MODEL_ID,
task="generate",
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt={"image": 1})
......
......@@ -4,8 +4,8 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from tests.conftest import cleanup
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_LEN_LEN = [
# Example models with sliding window.
......@@ -31,7 +31,7 @@ def test_disable_sliding_window(model_len_len, ):
model_config.max_model_len)
del vllm_disabled_model
cleanup()
cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model, disable_sliding_window=False)
vllm_enabled_model.generate("Hi my name is")
......@@ -41,4 +41,4 @@ def test_disable_sliding_window(model_len_len, ):
model_config.max_model_len)
del vllm_enabled_model
cleanup()
cleanup_dist_env_and_memory()
......@@ -107,8 +107,7 @@ def validate_generated_texts(hf_runner,
quantization='bitsandbytes',
load_format='bitsandbytes',
tensor_parallel_size=vllm_tp_size,
enforce_eager=False,
gpu_memory_utilization=0.8) as llm:
enforce_eager=False) as llm:
vllm_outputs = llm.generate_greedy(prompts, 8)
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
......
......@@ -57,7 +57,8 @@ def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
try:
model_config = ModelConfig(model_path,
model_path,
task="auto",
tokenizer=model_path,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......
......@@ -4,10 +4,10 @@ from typing import List, Optional, Sequence, Tuple, Union
import pytest
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import PromptLogprobs, SampleLogprobs
from ...conftest import cleanup
from ...models.utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs,
check_logprobs_close, check_outputs_equal)
......@@ -44,7 +44,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
yield llm
del llm
cleanup()
cleanup_dist_env_and_memory()
return generate
......
import contextlib
import functools
import gc
from typing import Callable, TypeVar
import pytest
import ray
import torch
from typing_extensions import ParamSpec
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel)
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@pytest.fixture(autouse=True)
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
ray.shutdown()
gc.collect()
torch.cuda.empty_cache()
cleanup_dist_env_and_memory(shutdown_ray=True)
_P = ParamSpec("_P")
......
......@@ -2,6 +2,42 @@ import pytest
from vllm.config import ModelConfig
@pytest.mark.parametrize(("model_id", "expected_task"), [
    ("facebook/opt-125m", "generate"),
    ("intfloat/e5-mistral-7b-instruct", "embedding"),
])
def test_auto_task(model_id, expected_task):
    """With task="auto", `config.task` should equal the expected task."""
    config_kwargs = dict(
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="float16",
    )
    resolved = ModelConfig(model_id, **config_kwargs)

    assert resolved.task == expected_task
@pytest.mark.parametrize(("model_id", "bad_task"), [
    ("facebook/opt-125m", "embedding"),
    ("intfloat/e5-mistral-7b-instruct", "generate"),
])
def test_incorrect_task(model_id, bad_task):
    """Building a ModelConfig with a mismatched task must raise ValueError."""
    config_kwargs = dict(
        task=bad_task,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="float16",
    )
    expected_message = r"does not support the .* task"

    with pytest.raises(ValueError, match=expected_message):
        ModelConfig(model_id, **config_kwargs)
MODEL_IDS_EXPECTED = [
("Qwen/Qwen1.5-7B", 32768),
("mistralai/Mistral-7B-v0.1", 4096),
......@@ -14,7 +50,8 @@ def test_disable_sliding_window(model_id_expected):
model_id, expected = model_id_expected
model_config = ModelConfig(
model_id,
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -32,7 +69,8 @@ def test_get_sliding_window():
# when use_sliding_window is False.
qwen2_model_config = ModelConfig(
"Qwen/Qwen1.5-7B",
"Qwen/Qwen1.5-7B",
task="auto",
tokenizer="Qwen/Qwen1.5-7B",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -49,7 +87,8 @@ def test_get_sliding_window():
mistral_model_config = ModelConfig(
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-v0.1",
task="auto",
tokenizer="mistralai/Mistral-7B-v0.1",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -70,7 +109,8 @@ def test_rope_customization():
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -82,7 +122,8 @@ def test_rope_customization():
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -98,7 +139,8 @@ def test_rope_customization():
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
"lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -112,7 +154,8 @@ def test_rope_customization():
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
"lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......
......@@ -32,5 +32,5 @@ def test_scalar_type_min_max(type_tuple):
max = torch.iinfo(torch_type).max
print(t, min, max, t.min(), t.max())
assert min == t.min()
assert max == t.max()
assert min == t.min(), f"min: {min} != {t.min()}"
assert max == t.max(), f"max: {max} != {t.max()}"
......@@ -46,9 +46,10 @@ def test_filter_subtensors():
@pytest.fixture(scope="module")
def llama_2_7b_files():
with TemporaryDirectory() as cache_dir:
input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
cache_dir=cache_dir,
ignore_patterns="*.bin*")
ignore_patterns=["*.bin*", "original/*"])
yield input_dir
......@@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
# Dump worker states to output directory
llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
path=output_dir)
# Copy metadata files to output directory
for file in os.listdir(input_dir):
if not any(file.endswith(ext) for ext in weights_patterns):
if not any(
file.endswith(ext) and not os.path.isdir(file)
for ext in weights_patterns):
shutil.copy(f"{input_dir}/{file}", output_dir)
......
......@@ -59,7 +59,7 @@ def test_deprecate_kwargs_always():
with pytest.warns(DeprecationWarning, match="'old_arg'"):
dummy(old_arg=1)
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
......@@ -69,10 +69,10 @@ def test_deprecate_kwargs_never():
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(old_arg=1)
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
......@@ -86,15 +86,15 @@ def test_deprecate_kwargs_dynamic():
with pytest.warns(DeprecationWarning, match="'old_arg'"):
dummy(old_arg=1)
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
is_deprecated = False
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(old_arg=1)
with error_on_warning():
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
......
import json
from typing import Generator, List, Optional
import partial_json_parser
import pytest
from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall,
ToolCall)
from vllm.entrypoints.openai.tool_parsers import JambaToolParser
from vllm.transformers_utils.detokenizer import detokenize_incrementally
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
MODEL = "ai21labs/Jamba-tiny-dev"
@pytest.fixture(scope="module")
def jamba_tokenizer():
    """Module-scoped tokenizer loaded for `MODEL`."""
    tokenizer = get_tokenizer(tokenizer_name=MODEL)
    return tokenizer
@pytest.fixture
def jamba_tool_parser(jamba_tokenizer):
    """Fresh `JambaToolParser` built on the shared tokenizer fixture."""
    parser = JambaToolParser(jamba_tokenizer)
    return parser
def assert_tool_calls(actual_tool_calls: List[ToolCall],
                      expected_tool_calls: List[ToolCall]):
    """Assert that two tool-call lists match pairwise.

    IDs are only checked for type and length (a string longer than 16
    characters); the call type must be "function" and the `function`
    payloads must compare equal.
    """
    assert len(actual_tool_calls) == len(expected_tool_calls)

    pairs = zip(actual_tool_calls, expected_tool_calls)
    for got, want in pairs:
        assert isinstance(got.id, str)
        assert len(got.id) > 16

        assert got.type == "function"
        assert got.function == want.function
def stream_delta_message_generator(
        jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer,
        model_output: str) -> Generator[DeltaMessage, None, None]:
    """Feed `model_output` to the streaming parser one token at a time.

    The output is tokenized without special tokens, then detokenized
    incrementally; after each token the parser's
    `extract_tool_calls_streaming` is called and any truthy delta message
    it returns is yielded.
    """
    token_ids = jamba_tokenizer.encode(model_output,
                                       add_special_tokens=False)

    # Incremental detokenization state carried from one token to the next.
    text_so_far = ""
    tokens_so_far = None
    prefix_offset = 0
    read_offset = 0

    for position, token_id in enumerate(token_ids):
        ids_before = token_ids[:position]
        ids_through = token_ids[:position + 1]

        detokenized = detokenize_incrementally(
            tokenizer=jamba_tokenizer,
            all_input_ids=ids_through,
            prev_tokens=tokens_so_far,
            prefix_offset=prefix_offset,
            read_offset=read_offset,
            skip_special_tokens=False,
            spaces_between_special_tokens=True,
        )
        new_tokens, delta_text, next_prefix_offset, next_read_offset = (
            detokenized)

        text_now = text_so_far + delta_text

        delta_message = jamba_tool_parser.extract_tool_calls_streaming(
            text_so_far,
            text_now,
            delta_text,
            ids_before,
            ids_through,
            [token_id],
            request=None,  # type: ignore[arg-type]
        )
        if delta_message:
            yield delta_message

        # Advance the state only after the delta has been handed out.
        text_so_far = text_now
        if tokens_so_far:
            tokens_so_far = tokens_so_far + new_tokens
        else:
            tokens_so_far = new_tokens
        prefix_offset = next_prefix_offset
        read_offset = next_read_offset
def test_extract_tool_calls_no_tools(jamba_tool_parser):
    """Plain text yields no tool calls and is returned as the content."""
    plain_text = "This is a test"

    result = jamba_tool_parser.extract_tool_calls(
        plain_text, request=None)  # type: ignore[arg-type]

    assert not result.tools_called
    assert result.tool_calls == []
    assert result.content == plain_text
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            ''' <tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
            ],
            None,
        ),
        (
            ''' Sure! let me call the tool for you.<tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
            ],
            " Sure! let me call the tool for you.",
        ),
        (
            ''' <tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Orlando",
                        "state": "FL",
                        "unit": "fahrenheit"
                    }))),
            ],
            None,
        ),
    ],
    ids=[
        "single_tool",
        "single_tool_with_content",
        "parallel_tools",
    ],
)
def test_extract_tool_calls(jamba_tool_parser, model_output,
                            expected_tool_calls, expected_content):
    """Non-streaming extraction returns the expected calls and content."""
    result = jamba_tool_parser.extract_tool_calls(
        model_output, request=None)  # type: ignore[arg-type]

    assert result.tools_called
    assert_tool_calls(result.tool_calls, expected_tool_calls)
    assert result.content == expected_content
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ('''This is a test''', [], '''This is a test'''),
        (
            ''' <tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
            ],
            " ",
        ),
        (
            ''' Sure! let me call the tool for you.<tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
            ],
            " Sure! let me call the tool for you.",
        ),
        (
            ''' <tool_calls>[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
            [
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Dallas",
                        "state": "TX",
                        "unit": "fahrenheit"
                    }))),
                ToolCall(function=FunctionCall(
                    name="get_current_weather",
                    arguments=json.dumps({
                        "city": "Orlando",
                        "state": "FL",
                        "unit": "fahrenheit"
                    }))),
            ],
            " ",
        ),
    ],
    ids=[
        "no_tools",
        "single_tool",
        "single_tool_with_content",
        "parallel_tools",
    ],
)
def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer,
                                      model_output, expected_tool_calls,
                                      expected_content):
    """Reassemble streamed deltas and compare them with the expected calls."""
    other_content: str = ''
    function_names: List[str] = []
    function_args_strs: List[str] = []
    tool_call_idx: int = -1
    tool_call_ids: List[Optional[str]] = []

    deltas = stream_delta_message_generator(jamba_tool_parser,
                                            jamba_tokenizer, model_output)
    for delta_message in deltas:
        # The tool parser must never stream a role.
        assert not delta_message.role

        if delta_message.content:
            other_content += delta_message.content

        streamed_tool_calls = delta_message.tool_calls
        if not (streamed_tool_calls and len(streamed_tool_calls) > 0):
            continue

        # Exactly one diff per delta, even when tools are called in parallel.
        assert len(streamed_tool_calls) == 1
        tool_call = streamed_tool_calls[0]

        # A new index means a new tool call: open empty slots for it.
        if tool_call.index != tool_call_idx:
            tool_call_idx = tool_call.index
            function_args_strs.append("")
            tool_call_ids.append(None)

        # Record a streamed ID only if this call does not have one yet.
        if tool_call.id and not tool_call_ids[tool_call.index]:
            tool_call_ids[tool_call.index] = tool_call.id

        # Nothing more to collect unless part of the function was streamed.
        if not tool_call.function:
            continue

        # The name should be streamed in its entirety, exactly one time.
        if tool_call.function.name:
            assert isinstance(tool_call.function.name, str)
            function_names.append(tool_call.function.name)

        # Argument fragments must be strings; append them to this call.
        if tool_call.function.arguments:
            assert isinstance(tool_call.function.arguments, str)
            function_args_strs[
                tool_call.index] += tool_call.function.arguments

    assert other_content == expected_content

    actual_tool_calls = []
    collected = zip(tool_call_ids, function_names, function_args_strs)
    for tool_call_id, function_name, function_args_str in collected:
        arguments = partial_json_parser.ensure_json(function_args_str,
                                                    Allow.OBJ | Allow.STR)
        actual_tool_calls.append(
            ToolCall(id=tool_call_id,
                     function=FunctionCall(name=function_name,
                                           arguments=arguments)))

    assert_tool_calls(actual_tool_calls, expected_tool_calls)
......@@ -87,8 +87,19 @@ def test_traces(trace_service):
f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout")
attributes = decode_attributes(trace_service.request.resource_spans[0].
scope_spans[0].spans[0].attributes)
request = trace_service.request
assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
......@@ -142,8 +153,19 @@ def test_traces_with_detailed_steps(trace_service):
f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout")
attributes = decode_attributes(trace_service.request.resource_spans[0].
scope_spans[0].spans[0].attributes)
request = trace_service.request
assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
......
......@@ -8,7 +8,7 @@ import time
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union
import openai
import pytest
......@@ -454,13 +454,13 @@ def multi_process_parallel(
@contextmanager
def error_on_warning():
def error_on_warning(category: Type[Warning] = Warning):
"""
Within the scope of this context manager, tests will fail if any warning
is emitted.
of the given category is emitted.
"""
with warnings.catch_warnings():
warnings.simplefilter("error")
warnings.filterwarnings("error", category=category)
yield
......@@ -587,7 +587,7 @@ def large_gpu_test(*, min_gb: int):
)
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
return test_skipif(fork_new_process_for_each_test(f))
return test_skipif(f)
return wrapper
......
......@@ -5,8 +5,9 @@ import pytest
import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_cpu, make_tensor_with_pad
from vllm.utils import make_tensor_with_pad
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import _get_graph_batch_size
......@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
return model_runner
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......@@ -74,7 +75,7 @@ def test_empty_seq_group():
assert return_seq_lens is None
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
assert torch.equal(actual, expected)
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......@@ -490,7 +491,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
"""
Tests that for encoder-decoder models with CUDA Graph capture and replay
enabled, the tensors used during the decode phase are correctly padded
enabled, the tensors used during the decode phase are correctly padded
for varying input batch sizes.
"""
model_runner = _create_model_runner(
......
import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.worker import Worker
def test_gpu_memory_profiling():
    """Check the KV-cache block count produced by GPU memory profiling.

    The total GPU memory reported by `torch.cuda.mem_get_info` is mocked
    so the expected block count does not depend on the installed GPU.
    """
    from unittest.mock import patch

    # Engine args used to build the worker.
    engine_args = EngineArgs(model="facebook/opt-125m",
                             dtype="half",
                             load_format="dummy")
    engine_config = engine_args.create_engine_config()
    engine_config.cache_config.num_gpu_blocks = 1000
    engine_config.cache_config.num_cpu_blocks = 1000

    # Build the worker.
    distributed_init_method = get_distributed_init_method(
        get_ip(), get_open_port())
    worker_kwargs = dict(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=True,
    )
    worker = Worker(**worker_kwargs)

    # The model has to be loaded before it can be profiled.
    worker.init_device()
    worker.load_model()

    def fake_mem_info():
        # Report 10GiB of total GPU RAM so the test is device-agnostic.
        total_bytes = 10 * 1024**3
        used_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"]
        return (total_bytes - used_bytes, total_bytes)

    with patch("torch.cuda.mem_get_info", side_effect=fake_mem_info):
        gpu_blocks, _ = worker.determine_num_available_blocks()

    # Peak vram usage by torch should be 0.7077 GiB
    # No memory should be allocated outside of torch
    # 9.0 GiB should be the utilization target
    # 8.2923 GiB should be available for the KV cache
    block_size = CacheEngine.get_cache_block_size(
        engine_config.cache_config, engine_config.model_config,
        engine_config.parallel_config)
    expected_blocks = (8.2923 * 1024**3) // block_size

    # Allow a small tolerance for portability: hardware, kernel, or
    # dependency changes could all affect memory utilization.
    # A 10 block tolerance here should be about 6MB of wiggle room.
    block_delta = abs(gpu_blocks - expected_blocks)
    assert block_delta < 10
......@@ -2,6 +2,10 @@
CI=${1:-0}
if [ $CI -eq 1 ]; then
set -e
fi
run_mypy() {
echo "Running mypy on $1"
if [ $CI -eq 1 ] && [ -z "$1" ]; then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment