Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
......@@ -131,19 +131,22 @@ def create_seq_group_metadata_from_prompts(
for i, final_len in enumerate(final_prompt_lens)
}
return [
SequenceGroupMetadata(
request_id=str(i),
is_prompt=len(cont_token_ids) == 0,
seq_data={
i: SequenceData.from_seqs(prompt_token_ids[:],
cont_token_ids[:]),
},
sampling_params=SamplingParams(temperature=0.0, ),
block_tables={i: block_allocations[i][:]},
) for i, (prompt_token_ids,
cont_token_ids) in enumerate(zip(prompts, continuations))
]
seq_grou_metadata_list = []
for i, (prompt_token_ids,
cont_token_ids) in enumerate(zip(prompts, continuations)):
data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids)
data.update_num_computed_tokens(
len(prompt_token_ids) + len(cont_token_ids) - 1)
seq_data = {i: data}
seq_grou_metadata_list.append(
SequenceGroupMetadata(
request_id=str(i),
is_prompt=len(cont_token_ids) == 0,
seq_data=seq_data,
sampling_params=SamplingParams(temperature=0.0),
block_tables={i: block_allocations[i][:]},
))
return seq_grou_metadata_list
def assert_logprobs_dict_allclose(
......
......@@ -64,9 +64,9 @@ def test_get_sliding_window():
def test_rope_customization():
TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
......
......@@ -2,6 +2,7 @@ from typing import List
import pytest
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_and_batch_prompt
STRING_INPUTS = [
......@@ -51,3 +52,28 @@ def test_parse_single_batch_token_consistent(token_input: List[int]):
def test_parse_single_batch_string_slice(inputs_slice: slice):
assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \
== parse_and_batch_prompt(STRING_INPUTS[inputs_slice])
# yapf: disable
@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
(None, [{}, {}]),
({}, [{}, {}]),
({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
])
# yapf: enable
def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
"""Test mm_processor_kwargs init for zipping enc/dec prompts."""
encoder_prompts = ['An encoder prompt', 'Another encoder prompt']
decoder_prompts = ['A decoder prompt', 'Another decoder prompt']
zipped_prompts = zip_enc_dec_prompts(encoder_prompts, decoder_prompts,
mm_processor_kwargs)
assert len(zipped_prompts) == len(encoder_prompts) == len(decoder_prompts)
for enc, dec, exp_kwargs, zipped in zip(encoder_prompts, decoder_prompts,
expected_mm_kwargs,
zipped_prompts):
assert isinstance(zipped, dict)
assert len(zipped.keys()) == 3
assert zipped['encoder_prompt'] == enc
assert zipped['decoder_prompt'] == dec
assert zipped['mm_processor_kwargs'] == exp_kwargs
......@@ -7,7 +7,7 @@ from typing import AsyncIterator, Tuple
import pytest
from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
get_open_port, merge_async_iterators)
get_open_port, merge_async_iterators, supports_kw)
from .utils import error_on_warning
......@@ -136,6 +136,8 @@ def parser():
def parser_with_config():
parser = FlexibleArgumentParser()
parser.add_argument('serve')
parser.add_argument('model_tag')
parser.add_argument('--served-model-name', type=str)
parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
......@@ -190,33 +192,77 @@ def test_missing_required_argument(parser):
def test_cli_override_to_config(parser_with_config):
args = parser_with_config.parse_args([
'serve', '--config', './data/test_config.yaml',
'serve', 'mymodel', '--config', './data/test_config.yaml',
'--tensor-parallel-size', '3'
])
assert args.tensor_parallel_size == 3
args = parser_with_config.parse_args([
'serve', '--tensor-parallel-size', '3', '--config',
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'./data/test_config.yaml'
])
assert args.tensor_parallel_size == 3
assert args.port == 12312
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'./data/test_config.yaml', '--port', '666'
])
assert args.tensor_parallel_size == 3
assert args.port == 666
def test_config_args(parser_with_config):
args = parser_with_config.parse_args(
['serve', '--config', './data/test_config.yaml'])
['serve', 'mymodel', '--config', './data/test_config.yaml'])
assert args.tensor_parallel_size == 2
def test_config_file(parser_with_config):
with pytest.raises(FileNotFoundError):
parser_with_config.parse_args(['serve', '--config', 'test_config.yml'])
parser_with_config.parse_args(
['serve', 'mymodel', '--config', 'test_config.yml'])
with pytest.raises(ValueError):
parser_with_config.parse_args(
['serve', '--config', './data/test_config.json'])
['serve', 'mymodel', '--config', './data/test_config.json'])
with pytest.raises(ValueError):
parser_with_config.parse_args([
'serve', '--tensor-parallel-size', '3', '--config', '--batch-size',
'32'
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'--batch-size', '32'
])
def test_no_model_tag(parser_with_config):
with pytest.raises(ValueError):
parser_with_config.parse_args(
['serve', '--config', './data/test_config.yaml'])
# yapf: enable
@pytest.mark.parametrize(
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
[
# Tests for positional argument support
(lambda foo: None, "foo", True, True, False),
(lambda foo: None, "foo", False, True, True),
# Tests for positional or keyword / keyword only
(lambda foo=100: None, "foo", True, True, False),
(lambda *, foo: None, "foo", False, True, True),
# Tests to make sure the names of variadic params are NOT supported
(lambda *args: None, "args", False, True, False),
(lambda **kwargs: None, "kwargs", False, True, False),
# Tests for if we allow var kwargs to add support
(lambda foo: None, "something_else", False, True, False),
(lambda foo, **kwargs: None, "something_else", False, True, True),
(lambda foo, **kwargs: None, "kwargs", True, True, False),
(lambda foo, **kwargs: None, "foo", True, True, False),
])
# yapf: disable
def test_supports_kw(callable,kw_name,requires_kw_only,
allow_var_kwargs,is_supported):
assert supports_kw(
callable=callable,
kw_name=kw_name,
requires_kw_only=requires_kw_only,
allow_var_kwargs=allow_var_kwargs
) == is_supported
import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
def test_chat_completion_request_with_no_tools():
# tools key is not present
request = ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
'facebook/opt-125m',
})
assert request.tool_choice == 'none'
# tools key is None
request = ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
'facebook/opt-125m',
'tools':
None
})
assert request.tool_choice == 'none'
# tools key present but empty
request = ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
'facebook/opt-125m',
'tools': []
})
assert request.tool_choice == 'none'
def test_chat_completion_request_with_tool_choice_but_no_tools():
with pytest.raises(ValueError,
match="When using `tool_choice`, `tools` must be set."):
ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
'facebook/opt-125m',
'tool_choice':
'auto'
})
with pytest.raises(ValueError,
match="When using `tool_choice`, `tools` must be set."):
ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
'facebook/opt-125m',
'tool_choice':
'auto',
'tools':
None
})
......@@ -3,18 +3,20 @@ from typing import List
import openai
import pytest
from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL
from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig,
ensure_system_prompt)
# test: make sure chat completions without tools provided work even when tools
# are enabled. This makes sure tool call chat templates work, AND that the tool
# parser stream processing doesn't change the output of the model.
@pytest.mark.asyncio
async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):
async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
server_config: ServerConfig):
models = await client.models.list()
model_name: str = models.data[0].id
chat_completion = await client.chat.completions.create(
messages=MESSAGES_WITHOUT_TOOLS,
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
max_tokens=150,
model=model_name,
......@@ -34,7 +36,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):
# make the same request, streaming
stream = await client.chat.completions.create(
messages=MESSAGES_WITHOUT_TOOLS,
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
max_tokens=150,
model=model_name,
......@@ -77,11 +79,12 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):
# tools, to make sure we can still get normal chat completion responses
# and that they won't be parsed as tools
@pytest.mark.asyncio
async def test_chat_completion_with_tools(client: openai.AsyncOpenAI):
async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
server_config: ServerConfig):
models = await client.models.list()
model_name: str = models.data[0].id
chat_completion = await client.chat.completions.create(
messages=MESSAGES_WITHOUT_TOOLS,
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
max_tokens=150,
model=model_name,
......@@ -102,7 +105,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI):
# make the same request, streaming
stream = await client.chat.completions.create(
messages=MESSAGES_WITHOUT_TOOLS,
messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
temperature=0,
max_tokens=150,
model=model_name,
......
......@@ -6,7 +6,7 @@ import pytest
from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL,
WEATHER_TOOL)
WEATHER_TOOL, ServerConfig)
# test: getting the model to generate parallel tool calls (streaming/not)
......@@ -14,7 +14,13 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
# may be added in the future. e.g. llama 3.1 models are not designed to support
# parallel tool calls.
@pytest.mark.asyncio
async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
server_config: ServerConfig):
if not server_config.get("supports_parallel", True):
pytest.skip("The {} model doesn't support parallel tool calls".format(
server_config["model"]))
models = await client.models.list()
model_name: str = models.data[0].id
chat_completion = await client.chat.completions.create(
......@@ -39,7 +45,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
assert tool_call.type == "function"
assert tool_call.function is not None
assert isinstance(tool_call.id, str)
assert len(tool_call.id) > 16
assert len(tool_call.id) >= 9
# make sure the weather tool was called correctly
assert tool_call.function.name == WEATHER_TOOL["function"]["name"]
......@@ -102,7 +108,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
if tool_call.id:
tool_call_id_count += 1
assert (isinstance(tool_call.id, str)
and (len(tool_call.id) > 16))
and (len(tool_call.id) >= 9))
# if parts of the function start being streamed
if tool_call.function:
......@@ -136,7 +142,13 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
# test: providing parallel tool calls back to the model to get a response
# (streaming/not)
@pytest.mark.asyncio
async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI):
async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
server_config: ServerConfig):
if not server_config.get("supports_parallel", True):
pytest.skip("The {} model doesn't support parallel tool calls".format(
server_config["model"]))
models = await client.models.list()
model_name: str = models.data[0].id
chat_completion = await client.chat.completions.create(
......
......@@ -33,7 +33,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
assert tool_calls[0].type == 'function'
assert tool_calls[0].function is not None
assert isinstance(tool_calls[0].id, str)
assert len(tool_calls[0].id) > 16
assert len(tool_calls[0].id) >= 9
# make sure the weather tool was called (classic example) with arguments
assert tool_calls[0].function.name == WEATHER_TOOL["function"]["name"]
......@@ -106,7 +106,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
assert finish_reason_count == 1
assert role_name == 'assistant'
assert isinstance(tool_call_id, str) and (len(tool_call_id) > 16)
assert isinstance(tool_call_id, str) and (len(tool_call_id) >= 9)
# validate the name and arguments
assert function_name == WEATHER_TOOL["function"]["name"]
......
from typing import Dict, List
from copy import deepcopy
from typing import Any, Dict, List, Optional
from openai.types.chat import (ChatCompletionMessageParam,
ChatCompletionToolParam)
......@@ -7,9 +8,30 @@ from typing_extensions import TypedDict
from tests.utils import VLLM_PATH
class ServerConfig(TypedDict):
class ServerConfig(TypedDict, total=False):
model: str
arguments: List[str]
system_prompt: Optional[str]
supports_parallel: Optional[bool]
def patch_system_prompt(messages: List[Dict[str, Any]],
system_prompt: str) -> List[Dict[str, Any]]:
new_messages = deepcopy(messages)
if new_messages[0]["role"] == "system":
new_messages[0]["content"] = system_prompt
else:
new_messages.insert(0, {"role": "system", "content": system_prompt})
return new_messages
def ensure_system_prompt(messages: List[Dict[str, Any]],
config: ServerConfig) -> List[Dict[str, Any]]:
prompt = config.get("system_prompt")
if prompt:
return patch_system_prompt(messages, prompt)
else:
return messages
# universal args for all models go here. also good if you need to test locally
......@@ -23,7 +45,33 @@ CONFIGS: Dict[str, ServerConfig] = {
"arguments": [
"--tool-call-parser", "hermes", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
]
],
"system_prompt":
"You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
},
"llama": {
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"arguments": [
"--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja")
],
"supports_parallel":
False,
},
"llama3.2": {
"model":
"meta-llama/Llama-3.2-3B-Instruct",
"arguments": [
"--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja")
],
"supports_parallel":
False,
},
"mistral": {
"model":
......@@ -32,7 +80,25 @@ CONFIGS: Dict[str, ServerConfig] = {
"--tool-call-parser", "mistral", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
"--ignore-patterns=\"consolidated.safetensors\""
]
],
"system_prompt":
"You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
},
"internlm": {
"model":
"internlm/internlm2_5-7b-chat",
"arguments": [
"--tool-call-parser", "internlm", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_internlm2_tool.jinja"),
"--trust_remote_code"
],
"supports_parallel":
False,
}
}
......@@ -55,7 +121,7 @@ WEATHER_TOOL: ChatCompletionToolParam = {
"type":
"string",
"description":
"the two-letter abbreviation for the state "
"must the two-letter abbreviation for the state "
"that the city is in, e.g. 'CA' which would "
"mean 'California'"
},
......@@ -97,15 +163,6 @@ SEARCH_TOOL: ChatCompletionToolParam = {
}
MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{
"role":
"system",
"content":
"You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
}, {
"role":
"user",
"content":
......
......@@ -5,9 +5,11 @@ import tempfile
import depyf
from vllm.compilation.levels import CompilationLevel
# disable custom dispatcher, let Dynamo takes over
# all the control
os.environ['VLLM_DYNAMO_USE_CUSTOM_DISPATCHER'] = "0"
os.environ['VLLM_TORCH_COMPILE_LEVEL'] = str(CompilationLevel.DYNAMO_AS_IS)
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
......
import os
from vllm.compilation.levels import CompilationLevel
from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation
......@@ -9,8 +11,9 @@ os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher():
compare_two_settings("google/gemma-2b",
arg1=["--enforce-eager"],
arg2=["--enforce-eager"],
env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"},
env2={})
compare_two_settings(
"google/gemma-2b",
arg1=["--enforce-eager"],
arg2=["--enforce-eager"],
env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)},
env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)})
......@@ -98,8 +98,6 @@ def test_traces(trace_service):
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(
SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
......@@ -155,8 +153,6 @@ def test_traces_with_detailed_steps(trace_service):
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(
SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
......
import asyncio
import functools
import os
import signal
......@@ -7,15 +8,15 @@ import time
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Literal, Optional, Union
import openai
import pytest
import requests
from openai.types.completion import Completion
from transformers import AutoTokenizer
from typing_extensions import ParamSpec
from typing_extensions import ParamSpec, assert_never
import vllm.envs as envs
from tests.models.utils import TextTextLogprobs
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
......@@ -23,8 +24,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
get_open_port, is_hip)
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import (FlexibleArgumentParser, GB_bytes,
cuda_device_count_stateless, get_open_port, is_hip)
if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage,
......@@ -162,11 +164,140 @@ class RemoteOpenAIServer:
)
def _test_completion(
client: openai.OpenAI,
model: str,
prompt: str,
token_ids: List[int],
):
results = []
# test with text prompt
completion = client.completions.create(model=model,
prompt=prompt,
max_tokens=5,
temperature=0.0)
results.append({
"test": "single_completion",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test using token IDs
completion = client.completions.create(
model=model,
prompt=token_ids,
max_tokens=5,
temperature=0.0,
)
results.append({
"test": "token_ids",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test seeded random sampling
completion = client.completions.create(model=model,
prompt=prompt,
max_tokens=5,
seed=33,
temperature=1.0)
results.append({
"test": "seeded_sampling",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test seeded random sampling with multiple prompts
completion = client.completions.create(model=model,
prompt=[prompt, prompt],
max_tokens=5,
seed=33,
temperature=1.0)
results.append({
"test":
"seeded_sampling",
"text": [choice.text for choice in completion.choices],
"finish_reason":
[choice.finish_reason for choice in completion.choices],
"usage":
completion.usage,
})
# test simple list
batch = client.completions.create(
model=model,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
)
results.append({
"test": "simple_list",
"text0": batch.choices[0].text,
"text1": batch.choices[1].text,
})
# test streaming
batch = client.completions.create(
model=model,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
results.append({
"test": "streaming",
"texts": texts,
})
return results
def _test_embeddings(
client: openai.OpenAI,
model: str,
text: str,
):
results = []
# test with text input
embeddings = client.embeddings.create(
model=model,
input=text,
encoding_format="float",
)
results.append({
"test": "single_embedding",
"embedding": embeddings.data[0].embedding,
"usage": embeddings.usage,
})
return results
def compare_two_settings(model: str,
arg1: List[str],
arg2: List[str],
env1: Optional[Dict[str, str]] = None,
env2: Optional[Dict[str, str]] = None,
*,
method: Literal["generate", "encode"] = "generate",
max_wait_seconds: Optional[float] = None) -> None:
"""
Launch API server with two different sets of arguments/environments
......@@ -180,17 +311,70 @@ def compare_two_settings(model: str,
env2: The second set of environment variables to pass to the API server.
"""
trust_remote_code = "--trust-remote-code"
if trust_remote_code in arg1 or trust_remote_code in arg2:
tokenizer = AutoTokenizer.from_pretrained(model,
trust_remote_code=True)
else:
tokenizer = AutoTokenizer.from_pretrained(model)
compare_all_settings(
model,
[arg1, arg2],
[env1, env2],
method=method,
max_wait_seconds=max_wait_seconds,
)
def compare_all_settings(model: str,
all_args: List[List[str]],
all_envs: List[Optional[Dict[str, str]]],
*,
method: Literal["generate", "encode"] = "generate",
max_wait_seconds: Optional[float] = None) -> None:
"""
Launch API server with several different sets of arguments/environments
and compare the results of the API calls with the first set of arguments.
Args:
model: The model to test.
all_args: A list of argument lists to pass to the API server.
all_envs: A list of environment dictionaries to pass to the API server.
"""
trust_remote_code = False
for args in all_args:
if "--trust-remote-code" in args:
trust_remote_code = True
break
tokenizer_mode = "auto"
for args in all_args:
if "--tokenizer-mode" in args:
tokenizer_mode = args[args.index("--tokenizer-mode") + 1]
break
tokenizer = get_tokenizer(
model,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
can_force_load_format = True
for args in all_args:
if "--load-format" in args:
can_force_load_format = False
break
prompt = "Hello, my name is"
token_ids = tokenizer(prompt)["input_ids"]
results = []
for args, env in ((arg1, env1), (arg2, env2)):
token_ids = tokenizer(prompt).input_ids
ref_results: List = []
for i, (args, env) in enumerate(zip(all_args, all_envs)):
if can_force_load_format:
# we are comparing the results and
# usually we don't need real weights.
# we force to use dummy weights by default,
# and it should work for most of the cases.
# if not, we can use VLLM_TEST_FORCE_LOAD_FORMAT
# environment variable to force the load format,
# e.g. in quantization tests.
args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
compare_results: List = []
results = ref_results if i == 0 else compare_results
with RemoteOpenAIServer(model,
args,
env_dict=env,
......@@ -207,104 +391,27 @@ def compare_two_settings(model: str,
"root": served_model.root,
})
# test with text prompt
completion = client.completions.create(model=model,
prompt=prompt,
max_tokens=5,
temperature=0.0)
results.append({
"test": "single_completion",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test using token IDs
completion = client.completions.create(
model=model,
prompt=token_ids,
max_tokens=5,
temperature=0.0,
)
results.append({
"test": "token_ids",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test seeded random sampling
completion = client.completions.create(model=model,
prompt=prompt,
max_tokens=5,
seed=33,
temperature=1.0)
results.append({
"test": "seeded_sampling",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
})
# test seeded random sampling with multiple prompts
completion = client.completions.create(model=model,
prompt=[prompt, prompt],
max_tokens=5,
seed=33,
temperature=1.0)
results.append({
"test":
"seeded_sampling",
"text": [choice.text for choice in completion.choices],
"finish_reason":
[choice.finish_reason for choice in completion.choices],
"usage":
completion.usage,
})
# test simple list
batch = client.completions.create(
model=model,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
)
results.append({
"test": "simple_list",
"text0": batch.choices[0].text,
"text1": batch.choices[1].text,
})
# test streaming
batch = client.completions.create(
model=model,
prompt=[prompt, prompt],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
results.append({
"test": "streaming",
"texts": texts,
})
n = len(results) // 2
arg1_results = results[:n]
arg2_results = results[n:]
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
assert arg1_result == arg2_result, (
f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
f"{arg1_result=} != {arg2_result=}")
if method == "generate":
results += _test_completion(client, model, prompt, token_ids)
elif method == "encode":
results += _test_embeddings(client, model, prompt)
else:
assert_never(method)
if i > 0:
# if any setting fails, raise an error early
ref_args = all_args[0]
ref_envs = all_envs[0]
compare_args = all_args[i]
compare_envs = all_envs[i]
for ref_result, compare_result in zip(ref_results,
compare_results):
assert ref_result == compare_result, (
f"Results for {model=} are not the same.\n"
f"{ref_args=} {ref_envs=}\n"
f"{compare_args=} {compare_envs=}\n"
f"{ref_result=}\n"
f"{compare_result=}\n")
def init_test_distributed_environment(
......@@ -454,6 +561,37 @@ def fork_new_process_for_each_test(
return wrapper
def large_gpu_test(*, min_gb: int):
"""
Decorate a test to be skipped if no GPU is available or it does not have
sufficient memory.
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
"""
try:
if current_platform.is_cpu():
memory_gb = 0
else:
memory_gb = current_platform.get_device_total_memory() / GB_bytes
except Exception as e:
warnings.warn(
f"An error occurred when finding the available memory: {e}",
stacklevel=2,
)
memory_gb = 0
test_skipif = pytest.mark.skipif(
memory_gb < min_gb,
reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
)
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
return test_skipif(fork_new_process_for_each_test(f))
return wrapper
def multi_gpu_test(*, num_gpus: int):
"""
Decorate a test to be run only when multiple GPUs are available.
......@@ -476,7 +614,8 @@ async def completions_with_server_args(
server_cli_args: List[str],
num_logprobs: Optional[int],
max_wait_seconds: int = 240,
) -> Completion:
max_tokens: Union[int, list] = 5,
) -> List[Completion]:
'''Construct a remote OpenAI server, obtain an async client to the
server & invoke the completions API to obtain completions.
......@@ -487,37 +626,49 @@ async def completions_with_server_args(
num_logprobs: Number of logprobs to report (or `None`)
max_wait_seconds: timeout interval for bringing up server.
Default: 240sec
max_tokens: max_tokens value for each of the given input prompts.
if only one max_token value is given, the same value is used
for all the prompts.
Returns:
OpenAI Completion instance
'''
if isinstance(max_tokens, int):
max_tokens = [max_tokens] * len(prompts)
assert len(max_tokens) == len(prompts)
outputs = None
max_wait_seconds = 240 * 3 # 240 is default
with RemoteOpenAIServer(model_name,
server_cli_args,
max_wait_seconds=max_wait_seconds) as server:
client = server.get_async_client()
outputs = await client.completions.create(model=model_name,
prompt=prompts,
temperature=0,
stream=False,
max_tokens=5,
logprobs=num_logprobs)
outputs = [ client.completions.create(model=model_name,
prompt=[p],
temperature=0,
stream=False,
max_tokens=max_tok,
logprobs=num_logprobs) \
for p, max_tok in zip(prompts, max_tokens) ]
outputs = await asyncio.gather(*outputs)
assert outputs is not None, "Completion API call failed."
return outputs
def get_client_text_generations(completions: Completion) -> List[str]:
def get_client_text_generations(completions: List[Completion]) -> List[str]:
'''Extract generated tokens from the output of a
request made to an Open-AI-protocol completions endpoint.
'''
return [x.text for x in completions.choices]
assert all([len(x.choices) == 1 for x in completions])
return [x.choices[0].text for x in completions]
def get_client_text_logprob_generations(
completions: Completion) -> List[TextTextLogprobs]:
completions: List[Completion]) -> List[TextTextLogprobs]:
'''Operates on the output of a request made to an Open-AI-protocol
completions endpoint; obtains top-rank logprobs for each token in
each :class:`SequenceGroup`
......@@ -526,4 +677,4 @@ def get_client_text_logprob_generations(
text = ''.join(text_generations)
return [(text_generations, text,
(None if x.logprobs is None else x.logprobs.top_logprobs))
for x in completions.choices]
for completion in completions for x in completion.choices]
compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
\ No newline at end of file
......@@ -20,6 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
awq, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
......
#!/bin/bash
SUCCESS=0
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "weight_loading/models.txt"
while getopts "c:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
......
#!/bin/bash
if command -v actionlint &> /dev/null; then
actionlint "$@"
exit 0
elif [ -x ./actionlint ]; then
./actionlint "$@"
exit 0
fi
# download a binary to the current directory - v1.7.3
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
./actionlint "$@"
#!/bin/bash
# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
if ! git diff --quiet; then
echo "Repo is dirty" >&2
exit 1
fi
if ! git describe --tags; then
echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
exit 1
fi
#!/bin/bash
CI=${1:-0}
run_mypy() {
echo "Running mypy on $1"
if [ $CI -eq 1 ] && [ -z "$1" ]; then
mypy "$@"
return
fi
mypy --follow-imports skip "$@"
}
run_mypy # Note that this is less strict than CI
run_mypy tests
run_mypy vllm/attention
run_mypy vllm/compilation
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker
import argparse
import json
from typing import Dict
from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry
from vllm.profiler.utils import TablePrinter, indent_string
def flatten_entries(entry_cls, profile_dict: Dict):
entries_and_depth = []
def get_entries(node, curr_depth=0):
entries_and_depth.append((entry_cls(**node["entry"]), curr_depth))
for child in node["children"]:
get_entries(
child,
curr_depth=curr_depth + 1,
)
for root in profile_dict:
get_entries(root)
return entries_and_depth
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--json-trace",
type=str,
required=True,
help="json trace file output by "
"examples/offline_profile.py")
parser.add_argument("--phase",
type=str,
choices=["prefill", "decode_1"],
required=True,
help="The phase to print the table for.")
parser.add_argument("--table",
type=str,
choices=["summary", "model"],
default="summary",
help="Which table to print, the summary table or the "
"layerwise model table")
args = parser.parse_args()
with open(args.json_trace, "r") as f:
profile_data = json.load(f)
if args.table == "summary":
entries_and_depths = flatten_entries(
SummaryStatsEntry, profile_data[args.phase]["summary_stats"])
column_widths = dict(name=80,
cuda_time_us=12,
pct_cuda_time=12,
invocations=15)
elif args.table == "model":
entries_and_depths = flatten_entries(
ModelStatsEntry, profile_data[args.phase]["model_stats"])
column_widths = dict(name=60,
cpu_time_us=12,
cuda_time_us=12,
pct_cuda_time=12,
trace=60)
# indent entry names based on the depth
entries = []
for entry, depth in entries_and_depths:
entry.name = indent_string(
entry.name,
indent=depth,
indent_style=lambda indent: "|" + "-" * indent + " ")
entries.append(entry)
TablePrinter(type(entries[0]), column_widths).print_table(entries)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment