Commit 96ae75ad authored by zhuwenwen
Browse files

Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

parents f9f4a735 2339d59f
...@@ -11,15 +11,16 @@ protobuf # Required by LlamaTokenizer. ...@@ -11,15 +11,16 @@ protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp aiohttp
openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard] uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0 pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11 lm-format-enforcer >= 0.10.9, < 0.11
outlines == 0.1.11 outlines == 0.1.11 # Requires pytorch
lark == 1.2.2
xgrammar >= 0.1.6; platform_machine == "x86_64" xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
...@@ -33,5 +34,6 @@ pyyaml ...@@ -33,5 +34,6 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.8.1 # required for compressed-tensors compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
depyf==0.18.0 # required for profiling and debugging torch.compile depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
--index-url https://download.pytorch.org/whl/nightly/cu124
torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray >= 2.9 ray[default] >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1; platform_machine != 'aarch64' torch == 2.5.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
...@@ -4,5 +4,5 @@ ...@@ -4,5 +4,5 @@
torch == 2.5.1 # should be aligned with "common" vLLM torch version torch == 2.5.1 # should be aligned with "common" vLLM torch version
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
...@@ -9,8 +9,8 @@ setuptools-scm>=8 ...@@ -9,8 +9,8 @@ setuptools-scm>=8
wheel wheel
jinja2 jinja2
torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl torch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl intel-extension-for-pytorch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl oneccl_bind_pt @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
triton-xpu == 3.0.0b1 triton-xpu == 3.0.0b1
...@@ -482,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -482,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.6.5" __version__ = "0.6.6.post1"
__version_tuple__ = (0, 6, 5) __version_tuple__ = (0, 6, 6)
__hcu_version__ = f'0.6.5+{version}' __hcu_version__ = f'0.6.6.post1+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
...@@ -493,6 +493,8 @@ except Exception as e: ...@@ -493,6 +493,8 @@ except Exception as e:
warnings.warn(f"Failed to read commit hash:\\n + str(e)", warnings.warn(f"Failed to read commit hash:\\n + str(e)",
RuntimeWarning, RuntimeWarning,
stacklevel=2) stacklevel=2)
__version__ = "dev"
__version_tuple__ = (0, 0, __version__)
""" """
with open(add_version_path, encoding="utf-8",mode="w") as file: with open(add_version_path, encoding="utf-8",mode="w") as file:
...@@ -525,10 +527,14 @@ def get_gaudi_sw_version(): ...@@ -525,10 +527,14 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
if not _is_hip(): # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
version = get_version( try:
write_to="vllm/_version.py", # TODO: move this to pyproject.toml if not _is_hip():
) version = get_version(
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
except LookupError:
version = "0.0.0"
sep = "+" if "+" not in version else "." # dev versions might contain + sep = "+" if "+" not in version else "." # dev versions might contain +
...@@ -537,7 +543,7 @@ def get_vllm_version() -> str: ...@@ -537,7 +543,7 @@ def get_vllm_version() -> str:
version += f"{sep}empty" version += f"{sep}empty"
elif _is_cuda(): elif _is_cuda():
if envs.VLLM_USE_PRECOMPILED: if envs.VLLM_USE_PRECOMPILED:
version += ".precompiled" version += f"{sep}precompiled"
else: else:
cuda_version = str(get_nvcc_cuda_version()) cuda_version = str(get_nvcc_cuda_version())
if cuda_version != MAIN_CUDA_VERSION: if cuda_version != MAIN_CUDA_VERSION:
...@@ -702,6 +708,7 @@ setup( ...@@ -702,6 +708,7 @@ setup(
ext_modules=ext_modules, ext_modules=ext_modules,
extras_require={ extras_require={
"tensorizer": ["tensorizer>=2.9.0"], "tensorizer": ["tensorizer>=2.9.0"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing "audio": ["librosa", "soundfile"], # Required for audio processing
"video": ["decord"] # Required for video processing "video": ["decord"] # Required for video processing
}, },
......
...@@ -122,7 +122,7 @@ def test_models( ...@@ -122,7 +122,7 @@ def test_models(
# if test_suite != TARGET_TEST_SUITE: # if test_suite != TARGET_TEST_SUITE:
# pytest.skip(f"Skip test for {test_suite}") # pytest.skip(f"Skip test for {test_suite}")
# if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa # if model == os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# # test ray adag # # test ray adag
# os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" # os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
# os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" # os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
...@@ -130,11 +130,6 @@ def test_models( ...@@ -130,11 +130,6 @@ def test_models(
# if attention_backend: # if attention_backend:
# os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend # os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
# # Import VLLM_USE_V1 dynamically to handle patching
# from vllm.envs import VLLM_USE_V1
# if VLLM_USE_V1 and distributed_executor_backend != "mp":
# pytest.skip(f"Skip {distributed_executor_backend} for V1")
# dtype = "half" # dtype = "half"
# max_tokens = 5 # max_tokens = 5
...@@ -153,11 +148,11 @@ def test_models( ...@@ -153,11 +148,11 @@ def test_models(
# hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) # hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
# check_outputs_equal( # check_outputs_equal(
# outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
# outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
# name_0="hf", name_0="hf",
# name_1="vllm", name_1="vllm",
# ) )
@pytest.mark.skip_v1 @pytest.mark.skip_v1
......
...@@ -13,7 +13,8 @@ from vllm.outputs import RequestOutput ...@@ -13,7 +13,8 @@ from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-7B-Instruct")
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -29,11 +30,13 @@ def llm(): ...@@ -29,11 +30,13 @@ def llm():
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_regex(sample_regex, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
sampling_params = SamplingParams( def test_guided_regex(sample_regex, llm, guided_decoding_backend: str):
temperature=0.8, sampling_params = SamplingParams(temperature=0.8,
top_p=0.95, top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex)) guided_decoding=GuidedDecodingParams(
regex=sample_regex,
backend=guided_decoding_backend))
outputs = llm.generate(prompts=[ outputs = llm.generate(prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}" f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2, ] * 2,
...@@ -53,11 +56,14 @@ def test_guided_regex(sample_regex, llm): ...@@ -53,11 +56,14 @@ def test_guided_regex(sample_regex, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_json_completion(sample_json_schema, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
sampling_params = SamplingParams( def test_guided_json_completion(sample_json_schema, llm,
temperature=1.0, guided_decoding_backend: str):
max_tokens=1000, sampling_params = SamplingParams(temperature=1.0,
guided_decoding=GuidedDecodingParams(json=sample_json_schema)) max_tokens=1000,
guided_decoding=GuidedDecodingParams(
json=sample_json_schema,
backend=guided_decoding_backend))
outputs = llm.generate(prompts=[ outputs = llm.generate(prompts=[
f"Give an example JSON for an employee profile " f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}" f"that fits this schema: {sample_json_schema}"
...@@ -80,11 +86,14 @@ def test_guided_json_completion(sample_json_schema, llm): ...@@ -80,11 +86,14 @@ def test_guided_json_completion(sample_json_schema, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_complex_json_completion(sample_complex_json_schema, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
sampling_params = SamplingParams( def test_guided_complex_json_completion(sample_complex_json_schema, llm,
temperature=1.0, guided_decoding_backend: str):
max_tokens=1000, sampling_params = SamplingParams(temperature=1.0,
guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema)) max_tokens=1000,
guided_decoding=GuidedDecodingParams(
json=sample_complex_json_schema,
backend=guided_decoding_backend))
outputs = llm.generate(prompts=[ outputs = llm.generate(prompts=[
f"Give an example JSON for an assignment grade " f"Give an example JSON for an assignment grade "
f"that fits this schema: {sample_complex_json_schema}" f"that fits this schema: {sample_complex_json_schema}"
...@@ -108,11 +117,14 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm): ...@@ -108,11 +117,14 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_definition_json_completion(sample_definition_json_schema, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_definition_json_completion(sample_definition_json_schema, llm,
guided_decoding_backend: str):
sampling_params = SamplingParams(temperature=1.0, sampling_params = SamplingParams(temperature=1.0,
max_tokens=1000, max_tokens=1000,
guided_decoding=GuidedDecodingParams( guided_decoding=GuidedDecodingParams(
json=sample_definition_json_schema)) json=sample_definition_json_schema,
backend=guided_decoding_backend))
outputs = llm.generate(prompts=[ outputs = llm.generate(prompts=[
f"Give an example JSON for solving 8x + 7 = -23 " f"Give an example JSON for solving 8x + 7 = -23 "
f"that fits this schema: {sample_definition_json_schema}" f"that fits this schema: {sample_definition_json_schema}"
...@@ -136,11 +148,14 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm): ...@@ -136,11 +148,14 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_choice_completion(sample_guided_choice, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
sampling_params = SamplingParams( def test_guided_choice_completion(sample_guided_choice, llm,
temperature=0.8, guided_decoding_backend: str):
top_p=0.95, sampling_params = SamplingParams(temperature=0.8,
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice)) top_p=0.95,
guided_decoding=GuidedDecodingParams(
choice=sample_guided_choice,
backend=guided_decoding_backend))
outputs = llm.generate( outputs = llm.generate(
prompts="The best language for type-safe systems programming is ", prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params, sampling_params=sampling_params,
...@@ -159,13 +174,15 @@ def test_guided_choice_completion(sample_guided_choice, llm): ...@@ -159,13 +174,15 @@ def test_guided_choice_completion(sample_guided_choice, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_grammar(sample_sql_statements, llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
def test_guided_grammar(sample_sql_statements, llm,
sampling_params = SamplingParams( guided_decoding_backend: str):
temperature=0.8, sampling_params = SamplingParams(temperature=0.8,
top_p=0.95, top_p=0.95,
max_tokens=1000, max_tokens=1000,
guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements)) guided_decoding=GuidedDecodingParams(
grammar=sample_sql_statements,
backend=guided_decoding_backend))
outputs = llm.generate( outputs = llm.generate(
prompts=("Generate a sql state that select col_1 from " prompts=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"), "table_1 where it is equals to 1"),
...@@ -221,15 +238,18 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): ...@@ -221,15 +238,18 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_json_object(llm): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
sampling_params = SamplingParams( def test_guided_json_object(llm, guided_decoding_backend: str):
temperature=1.0, sampling_params = SamplingParams(temperature=1.0,
max_tokens=100, max_tokens=100,
guided_decoding=GuidedDecodingParams(json_object=True)) n=2,
guided_decoding=GuidedDecodingParams(
json_object=True,
backend=guided_decoding_backend))
outputs = llm.generate( outputs = llm.generate(
prompts=("Generate a JSON object describing a person with name " prompts=("Generate a JSON object with curly braces for a person with "
"and age for John Smith who is 31 years old."), "name and age fields for John Smith who is 31 years old."),
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=True) use_tqdm=True)
...@@ -238,10 +258,11 @@ def test_guided_json_object(llm): ...@@ -238,10 +258,11 @@ def test_guided_json_object(llm):
assert output is not None assert output is not None
assert isinstance(output, RequestOutput) assert isinstance(output, RequestOutput)
generated_text = output.outputs[0].text for i in range(2):
print(generated_text) generated_text = output.outputs[i].text
assert generated_text is not None print(generated_text)
assert generated_text is not None
# Parse to verify it is valid JSON # Parse to verify it is valid JSON
parsed_json = json.loads(generated_text) parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict) assert isinstance(parsed_json, dict)
...@@ -77,6 +77,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, ...@@ -77,6 +77,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -133,6 +134,7 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -133,6 +134,7 @@ async def test_single_chat_session_audio_base64encoded(
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -153,6 +155,7 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -153,6 +155,7 @@ async def test_single_chat_session_audio_base64encoded(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.0,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
......
...@@ -18,6 +18,8 @@ from .test_completion import zephyr_lora_files # noqa: F401 ...@@ -18,6 +18,8 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
...@@ -469,8 +471,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -469,8 +471,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
# will fail on the second `guided_decoding_backend` even when I swap their order # will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256) # (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(client: openai.AsyncOpenAI, async def test_guided_choice_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_guided_choice): sample_guided_choice):
...@@ -487,6 +488,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -487,6 +488,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content choice1 = chat_completion.choices[0].message.content
...@@ -501,6 +503,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -501,6 +503,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content choice2 = chat_completion.choices[0].message.content
...@@ -509,8 +512,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -509,8 +512,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(client: openai.AsyncOpenAI, async def test_guided_json_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_json_schema): sample_json_schema):
...@@ -557,8 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, ...@@ -557,8 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(client: openai.AsyncOpenAI, async def test_guided_regex_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str, sample_regex): guided_decoding_backend: str, sample_regex):
messages = [{ messages = [{
...@@ -616,8 +617,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): ...@@ -616,8 +617,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_guided_choice): sample_guided_choice):
...@@ -649,8 +649,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -649,8 +649,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(client: openai.AsyncOpenAI, async def test_named_tool_use(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_json_schema): sample_json_schema):
...@@ -684,7 +683,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, ...@@ -684,7 +683,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"function": { "function": {
"name": "dummy_function_name" "name": "dummy_function_name"
} }
}) },
extra_body=dict(guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert len(message.content) == 0 assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments json_string = message.tool_calls[0].function.arguments
...@@ -719,6 +719,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, ...@@ -719,6 +719,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"name": "dummy_function_name" "name": "dummy_function_name"
} }
}, },
extra_body=dict(guided_decoding_backend=guided_decoding_backend),
stream=True) stream=True)
output = [] output = []
...@@ -741,10 +742,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, ...@@ -741,10 +742,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
async def test_required_tool_use_not_yet_supported( sample_json_schema):
client: openai.AsyncOpenAI, guided_decoding_backend: str,
sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -788,9 +787,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -788,9 +787,7 @@ async def test_required_tool_use_not_yet_supported(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema): sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
......
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import pytest_asyncio import pytest_asyncio
import requests import requests
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
...@@ -18,6 +19,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + ...@@ -18,6 +19,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task",
"embed",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"bfloat16", "bfloat16",
...@@ -46,11 +49,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -46,11 +49,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
] ]
# test single embedding # test single embedding
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
encoding_format="float", encoding_format="float",
) )
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
...@@ -60,11 +66,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -60,11 +66,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs # test using token IDs
input_tokens = [1, 1, 1, 1, 1] input_tokens = [1, 1, 1, 1, 1]
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_tokens, input=input_tokens,
encoding_format="float", encoding_format="float",
) )
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
...@@ -81,11 +90,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -81,11 +90,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
"The cat sat on the mat.", "A feline was resting on a rug.", "The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky." "Stars twinkle brightly in the night sky."
] ]
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
encoding_format="float", encoding_format="float",
) )
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 3 assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
...@@ -96,11 +108,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -96,11 +108,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[List[int]] # test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]] [25, 32, 64, 77]]
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_tokens, input=input_tokens,
encoding_format="float", encoding_format="float",
) )
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 4 assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
...@@ -125,14 +140,16 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, ...@@ -125,14 +140,16 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
"content": "Stars twinkle brightly in the night sky.", "content": "Stars twinkle brightly in the night sky.",
}] }]
chat_response = requests.post(server.url_for("v1/embeddings"), chat_response = requests.post(
json={ server.url_for("v1/embeddings"),
"model": model_name, json={
"messages": messages, "model": model_name,
"encoding_format": "float", "messages": messages,
}) "encoding_format": "float",
},
)
chat_response.raise_for_status() chat_response.raise_for_status()
chat_embeddings = chat_response.json() chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
prompt = tokenizer.apply_chat_template( prompt = tokenizer.apply_chat_template(
...@@ -149,13 +166,15 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, ...@@ -149,13 +166,15 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
# To be consistent with chat # To be consistent with chat
extra_body={"add_special_tokens": False}, extra_body={"add_special_tokens": False},
) )
completion_embeddings = completion_response.model_dump(mode="json") completion_embeddings = EmbeddingResponse.model_validate(
completion_response.model_dump(mode="json"))
assert chat_embeddings.pop("id") is not None assert chat_embeddings.id is not None
assert completion_embeddings.pop("id") is not None assert completion_embeddings.id is not None
assert chat_embeddings.pop("created") <= completion_embeddings.pop( assert chat_embeddings.created <= completion_embeddings.created
"created") assert chat_embeddings.model_dump(
assert chat_embeddings == completion_embeddings exclude={"id", "created"}) == (completion_embeddings.model_dump(
exclude={"id", "created"}))
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -205,10 +224,13 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, ...@@ -205,10 +224,13 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
] ]
# test single embedding # test single embedding
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
extra_body={"truncate_prompt_tokens": 10}) extra_body={"truncate_prompt_tokens": 10})
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
...@@ -220,10 +242,12 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, ...@@ -220,10 +242,12 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
] ]
embeddings = await client.embeddings.create( embedding_response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_tokens, input=input_tokens,
extra_body={"truncate_prompt_tokens": 10}) extra_body={"truncate_prompt_tokens": 10})
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
...@@ -242,10 +266,10 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, ...@@ -242,10 +266,10 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
] ]
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
embeddings = await client.embeddings.create( response = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
extra_body={"truncate_prompt_tokens": 8193}) extra_body={"truncate_prompt_tokens": 8193})
assert "error" in embeddings.object assert "error" in response.object
assert "truncate_prompt_tokens value is greater than max_model_len. "\ assert "truncate_prompt_tokens value is greater than max_model_len. "\
"Please, select a smaller truncation size." in embeddings.message "Please, select a smaller truncation size." in response.message
import base64
import numpy as np
import pytest
import requests
from vllm.entrypoints.openai.protocol import PoolingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
# Model served by the `server` fixture below (started with `--task classify`).
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
# Minimal Jinja chat template: each message is rendered as
# "<role>: <content>" followed by a newline.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def server():
    """Launch one classification server per test module and yield it.

    The server is started through ``RemoteOpenAIServer`` used as a context
    manager, so it is torn down when the module's tests have finished.
    """
    cli_args = [
        "--task",
        "classify",
        # Half precision keeps CI runs fast and light on memory.
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        "8192",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
    """Pool a single prompt, given first as text and then as token IDs."""

    def request_pooling(prompt_input) -> PoolingResponse:
        # POST to the pooling endpoint and parse the validated response.
        reply = requests.post(
            server.url_for("pooling"),
            json={
                "model": model_name,
                "input": prompt_input,
                "encoding_format": "float"
            },
        )
        reply.raise_for_status()
        return PoolingResponse.model_validate(reply.json())

    # Each case: (request input, expected number of prompt tokens).
    cases = [
        # test single pooling
        (["The chef prepared a delicious meal."], 7),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    ]

    for prompt_input, expected_tokens in cases:
        poolings = request_pooling(prompt_input)

        assert poolings.id is not None
        assert len(poolings.data) == 1
        assert len(poolings.data[0].data) == 2
        assert poolings.usage.completion_tokens == 0
        assert poolings.usage.prompt_tokens == expected_tokens
        assert poolings.usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
    """Pool a batch of prompts, given as List[str] and as List[List[int]]."""

    def check_batch(batch, expected_items: int, expected_tokens: int):
        # Send one batched request and verify item count and token usage.
        reply = requests.post(
            server.url_for("pooling"),
            json={
                "model": model_name,
                "input": batch,
                "encoding_format": "float"
            },
        )
        reply.raise_for_status()
        poolings = PoolingResponse.model_validate(reply.json())

        assert poolings.id is not None
        assert len(poolings.data) == expected_items
        assert len(poolings.data[0].data) == 2
        assert poolings.usage.completion_tokens == 0
        assert poolings.usage.prompt_tokens == expected_tokens
        assert poolings.usage.total_tokens == expected_tokens

    # test List[str]
    text_batch = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    check_batch(text_batch, expected_items=3, expected_tokens=25)

    # test List[List[int]]
    token_batch = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                   [25, 32, 64, 77]]
    check_batch(token_batch, expected_items=4, expected_tokens=17)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_pooling(server: RemoteOpenAIServer,
                                    model_name: str):
    """Pooling a chat must match pooling the equivalent rendered prompt."""
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    messages = [{"role": role, "content": content} for role, content in turns]
    pooling_url = server.url_for("pooling")

    # Chat-style request: the server applies the chat template itself.
    chat_response = requests.post(
        pooling_url,
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_poolings = PoolingResponse.model_validate(chat_response.json())

    # Render the same conversation locally with the same template.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )

    completions_response = requests.post(
        pooling_url,
        json={
            "model": model_name,
            "input": prompt,
            "encoding_format": "float",
            # To be consistent with chat
            "add_special_tokens": False,
        },
    )
    completions_response.raise_for_status()
    completion_poolings = PoolingResponse.model_validate(
        completions_response.json())

    assert chat_poolings.id is not None
    assert completion_poolings.id is not None
    assert chat_poolings.created <= completion_poolings.created

    # `id` and `created` differ per request; everything else must be equal.
    per_request_fields = {"id", "created"}
    chat_payload = chat_poolings.model_dump(exclude=per_request_fields)
    completion_payload = completion_poolings.model_dump(
        exclude=per_request_fields)
    assert chat_payload == completion_payload
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_pooling(server: RemoteOpenAIServer,
                                    model_name: str):
    """Float, base64 and default encodings must yield the same pooled data."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    def request_pooling(**extra_fields) -> PoolingResponse:
        # Same input and model every time; only the extra fields vary.
        reply = requests.post(
            server.url_for("pooling"),
            json={
                "input": input_texts,
                "model": model_name,
                **extra_fields,
            },
        )
        reply.raise_for_status()
        return PoolingResponse.model_validate(reply.json())

    responses_float = request_pooling(encoding_format="float")
    responses_base64 = request_pooling(encoding_format="base64")

    # The base64 payload is decoded here as raw float32 values.
    decoded_responses_base64_data = [
        np.frombuffer(base64.b64decode(item.data), dtype="float32").tolist()
        for item in responses_base64.data
    ]

    for idx in (0, 1):
        assert (responses_float.data[idx].data ==
                decoded_responses_base64_data[idx])

    # With no encoding_format in the request, the response is expected to
    # match the explicit float response.
    responses_default = request_pooling()

    for idx in (0, 1):
        assert (responses_float.data[idx].data ==
                responses_default.data[idx].data)
import asyncio import asyncio
from contextlib import suppress from contextlib import suppress
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional
from unittest.mock import MagicMock from unittest.mock import MagicMock
from vllm.config import MultiModalConfig from vllm.config import MultiModalConfig
...@@ -32,6 +33,10 @@ class MockModelConfig: ...@@ -32,6 +33,10 @@ class MockModelConfig:
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
logits_processor_pattern = None logits_processor_pattern = None
diff_sampling_param: Optional[dict] = None
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
@dataclass @dataclass
...@@ -95,3 +100,59 @@ def test_serving_chat_should_set_correct_max_tokens(): ...@@ -95,3 +100,59 @@ def test_serving_chat_should_set_correct_max_tokens():
asyncio.run(serving_chat.create_chat_completion(req)) asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 10 assert mock_engine.generate.call_args.args[1].max_tokens == 10
def test_serving_chat_could_load_correct_generation_config():
    """Model-config sampling defaults apply unless the request overrides them."""
    mock_model_config = MockModelConfig()
    mock_model_config.diff_sampling_param = {
        "temperature": 0.5,
        "repetition_penalty": 1.05
    }

    mock_engine = MagicMock(spec=MQLLMEngineClient)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

    # Initialize the serving chat
    serving_chat = OpenAIServingChat(mock_engine,
                                     mock_model_config,
                                     BASE_MODEL_PATHS,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     chat_template_content_format="auto",
                                     lora_modules=None,
                                     prompt_adapters=None,
                                     request_logger=None)
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": "what is 1+1?"
        }],
        guided_decoding_backend="outlines",
    )

    def submit_and_capture_params():
        # Errors raised by the call are suppressed; the test only inspects
        # the second positional argument of the latest `generate` call.
        with suppress(Exception):
            asyncio.run(serving_chat.create_chat_completion(req))
        return mock_engine.generate.call_args.args[1]

    # Nothing set on the request: both values come from the model config.
    params = submit_and_capture_params()
    assert params.temperature == 0.5
    assert params.repetition_penalty == 1.05

    # Test the param when user set it
    req.temperature = 0.1
    params = submit_and_capture_params()
    assert params.temperature == 0.1
    assert params.repetition_penalty == 1.05

    # Test When temperature==0.0
    req.temperature = 0.0
    params = submit_and_capture_params()
    assert params.temperature == 0.0
    assert params.repetition_penalty == 1.05
...@@ -90,6 +90,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, ...@@ -90,6 +90,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -182,6 +183,7 @@ async def test_single_chat_session_video_base64encoded( ...@@ -182,6 +183,7 @@ async def test_single_chat_session_video_base64encoded(
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -202,6 +204,7 @@ async def test_single_chat_session_video_base64encoded( ...@@ -202,6 +204,7 @@ async def test_single_chat_session_video_base64encoded(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.0,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
......
...@@ -92,6 +92,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, ...@@ -92,6 +92,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -184,6 +185,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -184,6 +185,7 @@ async def test_single_chat_session_image_base64encoded(
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
temperature=0.0,
top_logprobs=5) top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -204,6 +206,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -204,6 +206,7 @@ async def test_single_chat_session_image_base64encoded(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.0,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
......
...@@ -2,9 +2,9 @@ from typing import Dict ...@@ -2,9 +2,9 @@ from typing import Dict
import os import os
import pytest import pytest
import pytest_asyncio
import requests import requests
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, urls_port from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, urls_port
...@@ -54,12 +54,6 @@ def server(): ...@@ -54,12 +54,6 @@ def server():
yield remote_server yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]: def base64_encoded_image() -> Dict[str, str]:
return { return {
...@@ -90,18 +84,20 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, ...@@ -90,18 +84,20 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
], ],
}] }]
response = requests.post(server.url_for("v1/embeddings"), response = requests.post(
json={ server.url_for("v1/embeddings"),
"model": model_name, json={
"messages": messages, "model": model_name,
"encoding_format": "float" "messages": messages,
}) "encoding_format": "float"
},
)
response.raise_for_status() response.raise_for_status()
embeddings = EmbeddingResponse.model_validate(response.json())
embeddings = response.json()
assert embeddings["id"] is not None assert embeddings.id is not None
assert len(embeddings["data"]) == 1 assert len(embeddings.data) == 1
assert len(embeddings["data"][0]["embedding"]) == 3072 assert len(embeddings.data[0].embedding) == 3072
assert embeddings["usage"]["completion_tokens"] == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings["usage"]["prompt_tokens"] == 765 assert embeddings.usage.prompt_tokens == 765
assert embeddings["usage"]["total_tokens"] == 765 assert embeddings.usage.total_tokens == 765
# Adapted from https://github.com/sgl-project/sglang/pull/2575
import itertools
import pytest
import torch
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
# Skip the whole module on devices whose compute capability is below 9.0.
# The check is on the device's (major, minor) capability, not on the CUDA
# toolkit version, so the skip message names the capability.
if current_platform.get_device_capability() < (9, 0):
    pytest.skip("FP8 Triton requires compute capability 9.0 or higher",
                allow_module_level=True)
# Test configurations
# Each list below is one axis of an itertools.product parametrization; the
# bracketed trailing comments record alternative value sets.

# Axes for test_per_token_group_quant_fp8 (together with SEEDS).
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
NUM_TOKENS = [7, 83, 2048]
D = [512, 4096, 5120, 13824]
GROUP_SIZE = [64, 128, 256, 512]

# Problem sizes for test_w8a8_block_fp8_matmul.
M = [1, 7, 83, 512, 2048]
N = [128, 512, 1024, 4096, 7748, 13824]
K = [256, 4096, 5120, 3884, 13824]

# Problem sizes for test_w8a8_block_fp8_fused_moe.
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# and its hidden size is 7168.
M_moe = [1, 7, 83, 512, 2048]
N_moe = [4608] # [128, 4608, 13824]
K_moe = [7168] # [256, 7168, 13824]

# [block_n, block_k] quantization block shapes, shared by the matmul and
# fused-MoE tests.
BLOCK_SIZE = [[128, 128]]
# Number of experts and top-k values for the fused-MoE test.
E = [256] # [8, 24, 128, 256]
TOP_KS = [1] # [1, 2, 6]
# Output dtypes for the matmul test.
OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16]
# Seeds passed to torch.manual_seed at the start of every test.
SEEDS = [0]
def native_per_token_group_quant_fp8(x,
group_size,
eps=1e-10,
dtype=torch.float8_e4m3fn):
"""Function to perform per-token-group quantization on an input tensor
`x` using native torch."""
assert x.shape[-1] % group_size == 0, ("the last dimension of `x` cannot "
"be divisible by `group_size`")
assert x.is_contiguous(), "`x` is not contiguous"
finfo = torch.finfo(dtype)
fp8_min = finfo.min
fp8_max = finfo.max
x_ = x.reshape(x.numel() // group_size, group_size)
amax = x_.abs().max(dim=-1,
keepdim=True)[0].clamp(min=eps).to(torch.float32)
x_s = amax / fp8_max
x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
x_q = x_q.reshape(x.shape)
x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
return x_q, x_s
def native_w8a8_block_fp8_matmul(A,
                                 B,
                                 As,
                                 Bs,
                                 block_size,
                                 output_dtype=torch.float16):
    """Matrix multiplication with block-wise quantization using native torch.

    Computes `A @ B.T` tile by tile in float32. Each partial product over a
    `(block_n, block_k)` tile of `B` is multiplied by the matching scales
    from `As` (per row of `A`, per K-tile) and `Bs` (per N-tile, per K-tile)
    before being accumulated.

    Args:
        A: Left operand with shape `(..., K)`.
        B: Contiguous 2-D right operand with shape `(N, K)`.
        As: Scales for `A`, shape `(..., ceil(K / block_k))`.
        Bs: Scales for `B`, shape `(ceil(N / block_n), ceil(K / block_k))`.
        block_size: Two-element sequence `[block_n, block_k]`.
        output_dtype: Dtype of the returned tensor.

    Returns:
        Tensor of shape `A.shape[:-1] + (N,)` in `output_dtype`.
    """
    A = A.to(torch.float32)
    B = B.to(torch.float32)

    assert A.shape[-1] == B.shape[-1]
    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    assert len(block_size) == 2
    block_n = block_size[0]
    block_k = block_size[1]
    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
    assert A.shape[:-1] == As.shape[:-1]

    # Flatten every leading dimension of A (and its scales) into rows.
    num_rows = A.numel() // A.shape[-1]
    N, K = B.shape
    result_shape = A.shape[:-1] + (N, )
    A = A.reshape(num_rows, A.shape[-1])
    As = As.reshape(num_rows, As.shape[-1])

    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    assert n_tiles == Bs.shape[0]
    assert k_tiles == Bs.shape[1]

    C = torch.zeros((num_rows, N), dtype=torch.float32, device=A.device)

    for j in range(n_tiles):
        n_span = slice(j * block_n, min((j + 1) * block_n, N))
        for i in range(k_tiles):
            k_span = slice(i * block_k, min((i + 1) * block_k, K))
            # Per-row activation scale times the scalar weight-tile scale.
            tile_scale = As[:, i:i + 1] * Bs[j][i]
            partial = torch.matmul(A[:, k_span], B[n_span, k_span].t())
            C[:, n_span] += partial * tile_scale

    return C.reshape(result_shape).to(output_dtype)
def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
    """Fused moe with block-wise quantization using native torch.

    Every token is routed to its `topk` highest-scoring experts (softmax over
    the last dimension of `score`). For each expert, the routed tokens go
    through a block-quantized matmul with `w1[i]`, `SiluAndMul`, a
    re-quantization and a second block-quantized matmul with `w2[i]`. The
    per-expert outputs are combined using the routing weights.

    Args:
        a: 2-D activations, one row per token.
        w1: Stacked first-projection weights, indexed by expert on dim 0.
        w2: Stacked second-projection weights, indexed by expert on dim 0;
            `w2.shape[1]` is the output width.
        w1_s: Block scales for `w1`, indexed by expert on dim 0.
        w2_s: Block scales for `w2`, indexed by expert on dim 0.
        score: Router logits, one row per token.
        topk: Number of experts selected per token.
        block_shape: `[block_n, block_k]`; `block_k` is also used as the
            activation quantization group size.

    Returns:
        Tensor with one row per token and `w2.shape[1]` columns.
    """
    B, D = a.shape
    # Replicate each token once per selected expert.
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)

    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    block_k = block_shape[1]
    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
    a_q = a_q.to(torch.float32)

    for i in range(w1.shape[0]):
        mask = topk_ids == i
        if not mask.any():
            # No token was routed to this expert.
            continue
        inter_out = native_w8a8_block_fp8_matmul(a_q[mask],
                                                 w1[i],
                                                 a_s[mask],
                                                 w1_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)
        act_out = SiluAndMul().forward_native(inter_out)
        # Only the quantized activations and their scales are used below.
        act_out_q, act_out_s = native_per_token_group_quant_fp8(
            act_out, block_k)
        out[mask] = native_w8a8_block_fp8_matmul(act_out_q,
                                                 w2[i],
                                                 act_out_s,
                                                 w2_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)

    # Weight each expert output by its routing weight and sum per token.
    return (out.view(B, -1, w2.shape[1]) *
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
# Skip all tests if CUDA is not available
# NOTE(review): importorskip only verifies that the `torch.cuda` module can
# be imported; presumably it does not check for a usable GPU. Confirm whether
# a `torch.cuda.is_available()` check was intended here.
pytest.importorskip("torch.cuda")
@pytest.fixture(autouse=True)
def setup_cuda():
    """Set "cuda" as torch's default device before each test in this module."""
    # autouse=True: the fixture runs for every test without being requested.
    # NOTE(review): the previous default device is not restored afterwards;
    # confirm this cannot leak into tests from other modules.
    torch.set_default_device("cuda")
@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
                         itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
                                           SEEDS))
@torch.inference_mode()
def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
    """Compare the per-token-group FP8 quantizer with the torch reference."""
    torch.manual_seed(seed)
    activations = torch.rand(num_tokens, d, dtype=dtype)

    expected_q, expected_scale = native_per_token_group_quant_fp8(
        activations, group_size)
    actual_q, actual_scale = per_token_group_quant_fp8(activations, group_size)

    # Quantized values are compared in float32 with a loose relative
    # tolerance; the scales use allclose's default tolerances.
    quantized_match = torch.allclose(actual_q.to(torch.float32),
                                     expected_q.to(torch.float32),
                                     rtol=0.15)
    assert quantized_match
    assert torch.allclose(actual_scale, expected_scale)
@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
                         itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
                                           SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
    """Compare the block-wise FP8 matmul kernel with the torch reference."""
    torch.manual_seed(seed)
    factor_for_scale = 1e-2
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max = fp8_info.max
    fp8_min = fp8_info.min

    def random_fp8(rows, cols):
        # Uniform values spanning the fp8 range, clamped and cast to fp8.
        values = (torch.rand(rows, cols, dtype=torch.float32) -
                  0.5) * 2 * fp8_max
        return values.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    # The order of random draws is kept: A, B, then the A and B scales.
    A_fp8 = random_fp8(M, K)
    B_fp8 = random_fp8(N, K)

    block_n = block_size[0]
    block_k = block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale

    ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                           out_dtype)
    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)

    # Mean absolute error relative to the mean magnitude of the reference.
    mean_abs_error = torch.mean(
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
    mean_abs_ref = torch.mean(torch.abs(ref_out.to(torch.float32)))
    rel_diff = mean_abs_error / mean_abs_ref
    assert rel_diff < 0.001
@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
                         itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
                                           BLOCK_SIZE, DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
    """Compare fused_moe with block-wise FP8 weights to the torch reference."""
    torch.manual_seed(seed)
    factor_for_scale = 1e-2
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max = fp8_info.max
    fp8_min = fp8_info.min

    def random_fp8_weight(shape):
        # The bfloat16 staging tensor is dropped when this helper returns,
        # so only the fp8 weights stay alive.
        staged = (torch.rand(shape, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
        return staged.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    def ceil_div(numerator, denominator):
        return (numerator + denominator - 1) // denominator

    # The order of random draws is kept: a, w1, w2, w1_s, w2_s, score.
    a = torch.randn((M, K), dtype=dtype) / 10
    w1 = random_fp8_weight((E, 2 * N, K))
    w2 = random_fp8_weight((E, K, N))

    block_n = block_size[0]
    block_k = block_size[1]
    n_tiles_w1 = ceil_div(2 * N, block_n)
    n_tiles_w2 = ceil_div(K, block_n)
    k_tiles_w1 = ceil_div(K, block_k)
    k_tiles_w2 = ceil_div(N, block_k)

    w1_s = torch.rand(
        (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale
    w2_s = torch.rand(
        (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale

    score = torch.randn((M, E), dtype=dtype)

    out = fused_moe(
        a,
        w1,
        w2,
        score,
        topk,
        renormalize=False,
        use_fp8_w8a8=True,
        w1_scale=w1_s,
        w2_scale=w2_s,
        block_shape=block_size,
    )
    ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
                                       block_size)

    print(f"{out.sum()=}")
    print(f"{ref_out.sum()=}")

    # Mean absolute error relative to the mean magnitude of the reference.
    mean_abs_error = torch.mean(
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
    mean_abs_ref = torch.mean(torch.abs(ref_out.to(torch.float32)))
    rel_diff = mean_abs_error / mean_abs_ref
    assert rel_diff < 0.03
"""Tests for sparse cutlass kernels
Run `pytest tests/kernels/test_semi_structured.py`.
"""
from typing import Optional, Tuple, Type
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported)
from vllm.platforms import current_platform
# Device strings: only "cuda:0" when exactly one GPU is reported, otherwise
# "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
# Device capability folded into one integer: major * 10 + minor.
capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1]
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor):
    """Clamp to [-128, 127], round, and cast to int8."""
    clamped = tensor.clamp(min=-128, max=127)
    return clamped.round().to(dtype=torch.int8)
def rand_int8(shape: tuple, device: str = "cuda"):
    """Return a random int8 tensor of the given shape on `device`."""
    # torch.rand is uniform on [0, 1); scale and shift it to [-128, 127).
    uniform = torch.rand(shape, device=device)
    return to_int8(uniform * 255 - 128)
def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast `tensor` to bfloat16."""
    target_dtype = torch.bfloat16
    return tensor.to(dtype=target_dtype)
def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast `tensor` to float16."""
    target_dtype = torch.float16
    return tensor.to(dtype=target_dtype)
def prune_to_2_4(tensor):
    """Zero all but the two largest-magnitude entries in each group of four.

    The tensor is viewed as rows of four consecutive elements; in every row
    the two entries with the largest absolute value are kept and the others
    are set to zero. The result has the same shape as the input.
    """
    groups = tensor.reshape(-1, 4)

    # Positions of the two largest magnitudes within each group.
    _, keep_idx = torch.abs(groups).topk(2, dim=1)

    # 0/1 mask with ones at the kept positions.
    keep_mask = torch.zeros_like(groups)
    keep_mask.scatter_(1, keep_idx, value=1)

    pruned = groups * keep_mask

    # Multiplying a negative value by zero yields -0.0; normalise to 0.0.
    pruned[pruned == -0.0] = 0.0

    return pruned.reshape(tensor.shape)
def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create random operands for a sparse matmul test.

    Builds a random `(m, k)` matrix A and a random B that is pruned with
    `prune_to_2_4`, casts both to `dtype`, and compresses B with
    `ops.cutlass_sparse_compress`.

    Returns:
        `(b_compressed, e, a, b)`: compressed B, its metadata, and the
        original A and B.

    Raises:
        ValueError: If `dtype` is not int8, float8_e4m3fn, float16 or
            bfloat16.
    """
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    b = prune_to_2_4(b.t()).t()

    # One cast helper per supported dtype.
    cast_for_dtype = {
        torch.int8: to_int8,
        torch.float8_e4m3fn: to_fp8,
        torch.float16: to_fp16,
        torch.bfloat16: to_bf16,
    }
    cast = cast_for_dtype.get(dtype)
    if cast is None:
        raise ValueError("unsupported dtype")
    a, b = cast(a), cast(b)

    b_compressed, e = ops.cutlass_sparse_compress(b.t())

    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b
def baseline_scaled_mm(a: torch.Tensor,
                       b: torch.Tensor,
                       scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: Type[torch.dtype],
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Reference scaled matmul: `scale_a * (scale_b * (a @ b))` plus bias.

    The product is computed in float32 and cast to `out_dtype`; `bias`, when
    given, is added after that cast.
    """
    product = torch.mm(a.to(dtype=torch.float32), b.to(dtype=torch.float32))
    scaled = (scale_a * (scale_b * product)).to(out_dtype)

    if bias is None:
        return scaled
    return scaled + bias
@pytest.mark.skipif(not sparse_cutlass_supported(),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
# Test working with a subset of A and B for sparse matmul
def test_cutlass_sparse_subset():
    """Sparse scaled matmul on a slice of a larger A must match the baseline."""
    big_m = 1024
    m, n, k = 512, 512, 512

    # Create tensors; A is allocated with more rows than are used below.
    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
                                                     big_m, n, k)
    a = whole_a[:m, :k]

    # The order of random draws is kept: scale_a first, then scale_b.
    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10

    result_dtype = torch.bfloat16
    out = ops.cutlass_scaled_sparse_mm(a,
                                       b_comp,
                                       e,
                                       scale_a,
                                       scale_b,
                                       out_dtype=result_dtype)
    baseline = baseline_scaled_mm(a,
                                  b,
                                  scale_a,
                                  scale_b,
                                  out_dtype=result_dtype)

    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): ...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0 assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0 assert len(buffer.buffer) == 0
print("Test run passed!") print("My rank: %d, Test run passed!" % (my_rank))
def stress_test(my_rank, buf, device): def stress_test(my_rank, buf, device):
...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): ...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else: else:
torch.distributed.send(torch.tensor([n]), 0) torch.distributed.send(torch.tensor([n]), 0)
print("Passed stress test!") print("My rank: %d, Passed stress test!" % (my_rank))
if __name__ == "__main__": if __name__ == "__main__":
......
#!/bin/bash #!/bin/bash
RANK=0 python test_lookup_buffer.py & RANK=0 python3 test_lookup_buffer.py &
RANK=1 python test_lookup_buffer.py & PID0=$!
\ No newline at end of file RANK=1 python3 test_lookup_buffer.py &
PID1=$!
wait $PID0
wait $PID1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment