Commit 4d3a2c28 authored by zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
from typing import List
import os
import pytest
from vllm import LLM
from ..openai.test_vision import TEST_IMAGE_URLS
from ...utils import models_path_prefix
def test_chat():
    """Send one system+user conversation through ``LLM.chat``.

    Only the number of returned outputs is checked, not their content.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    llm = LLM(model=model_path)

    user_prompt = "Explain the concept of entropy."
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": user_prompt}

    outputs = llm.chat([system_turn, user_turn])
    # A single conversation was submitted, so a single output is expected.
    assert len(outputs) == 1
def test_multi_chat():
    """Submit two conversations in one ``LLM.chat`` call.

    Each conversation is a system turn followed by a user turn; the test
    asserts that two outputs come back.
    """
    llm = LLM(model=os.path.join(models_path_prefix,
                                 "meta-llama/Llama-3.2-1B-Instruct"))

    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )
    # Build one [system, user] conversation per prompt.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    outputs = llm.chat(conversations)
    assert len(outputs) == 2
@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    """Chat with a vision model using several images in one user message.

    Args:
        image_urls: URLs placed, in order, as ``image_url`` content parts
            ahead of the text question.
    """
    llm = LLM(
        model=os.path.join(models_path_prefix,
                           "microsoft/Phi-3.5-vision-instruct"),
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        # The parametrized case sends two images in a single prompt.
        limit_mm_per_prompt={"image": 2},
    )

    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for image_url in image_urls]
    messages = [{
        "role": "user",
        "content": [
            *image_parts,
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    outputs = llm.chat(messages)
    # `len(outputs) >= 0` is true for any sequence and could never fail.
    # One conversation is submitted, so expect exactly one output, the same
    # check the text-only single-conversation chat test makes.
    assert len(outputs) == 1
...@@ -4,9 +4,8 @@ from typing import List ...@@ -4,9 +4,8 @@ from typing import List
import pytest import pytest
import os import os
from vllm import LLM, EmbeddingRequestOutput, PoolingParams from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm.distributed import cleanup_dist_env_and_memory
from ...conftest import cleanup
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
...@@ -43,29 +42,14 @@ def llm(): ...@@ -43,29 +42,14 @@ def llm():
del llm del llm
cleanup() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[EmbeddingRequestOutput], def assert_outputs_equal(o1: List[PoolingRequestOutput],
o2: List[EmbeddingRequestOutput]): o2: List[PoolingRequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
    """Compare ``encode`` results for the legacy and current call forms.

    The legacy ``prompts=`` keyword call must emit a ``DeprecationWarning``
    matching ``'prompts'``. Its outputs are then compared with the positional
    string call and with the ``{"prompt": ...}`` dict call.
    """
    pooling_params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.encode(prompts=prompt,
                                   pooling_params=pooling_params)

    for current_input in (prompt, {"prompt": prompt}):
        current_output = llm.encode(current_input,
                                    pooling_params=pooling_params)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, ...@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output) assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
    """Compare batched ``encode`` results for legacy and current call forms.

    The legacy ``prompts=PROMPTS`` keyword call must emit a
    ``DeprecationWarning`` matching ``'prompts'``. Its outputs are compared
    with the positional list call and with a list of ``{"prompt": ...}``
    dicts.
    """
    pooling_params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.encode(prompts=PROMPTS,
                                   pooling_params=pooling_params)

    positional_output = llm.encode(PROMPTS, pooling_params=pooling_params)
    assert_outputs_equal(legacy_output, positional_output)

    dict_prompts = [{"prompt": p} for p in PROMPTS]
    dict_output = llm.encode(dict_prompts, pooling_params=pooling_params)
    assert_outputs_equal(legacy_output, dict_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams() pooling_params = PoolingParams()
......
...@@ -5,9 +5,7 @@ import os ...@@ -5,9 +5,7 @@ import os
import pytest import pytest
from vllm import LLM, RequestOutput, SamplingParams from vllm import LLM, RequestOutput, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from ...conftest import cleanup
from ..openai.test_vision import TEST_IMAGE_URLS
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m") MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
...@@ -42,30 +40,13 @@ def llm(): ...@@ -42,30 +40,13 @@ def llm():
del llm del llm
cleanup() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
    """Compare ``generate`` results for the legacy and current call forms.

    Sampling uses ``temperature=0.0`` and ``top_p=1.0``. The legacy
    ``prompts=`` keyword call must emit a ``DeprecationWarning`` matching
    ``'prompts'``; its outputs are compared with the positional string call
    and with the ``{"prompt": ...}`` dict call.
    """
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.generate(prompts=prompt,
                                     sampling_params=sampling_params)

    for current_input in (prompt, {"prompt": prompt}):
        current_output = llm.generate(current_input,
                                      sampling_params=sampling_params)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, ...@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output) assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
    """Compare batched ``generate`` results for legacy and current call forms.

    Sampling uses ``temperature=0.0`` and ``top_p=1.0``. The legacy
    ``prompts=PROMPTS`` keyword call must emit a ``DeprecationWarning``
    matching ``'prompts'``; its outputs are compared with the positional list
    call and with a list of ``{"prompt": ...}`` dicts.
    """
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.generate(prompts=PROMPTS,
                                     sampling_params=sampling_params)

    positional_output = llm.generate(PROMPTS,
                                     sampling_params=sampling_params)
    assert_outputs_equal(legacy_output, positional_output)

    dict_prompts = [{"prompt": p} for p in PROMPTS]
    dict_output = llm.generate(dict_prompts,
                               sampling_params=sampling_params)
    assert_outputs_equal(legacy_output, dict_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0) sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
...@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM): ...@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied # sampling_params is None, default params should be applied
outputs = llm.generate(PROMPTS, sampling_params=None) outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)
def test_chat():
    """Send one system+user conversation through ``LLM.chat``.

    Only the number of returned outputs is checked.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Meta-Llama-3-8B-Instruct")
    llm = LLM(model=model_path)

    question = "Explain the concept of entropy."
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": question},
    ]

    outputs = llm.chat(conversation)
    assert len(outputs) == 1
def test_multi_chat():
    """Submit two conversations in one ``LLM.chat`` call; expect two outputs."""
    llm = LLM(model=os.path.join(models_path_prefix,
                                 "meta-llama/Meta-Llama-3-8B-Instruct"))

    def build_conversation(user_prompt):
        # Every conversation shares the same system turn.
        return [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]

    batch = [
        build_conversation("Explain the concept of entropy."),
        build_conversation("Explain what among us is."),
    ]

    outputs = llm.chat(batch)
    assert len(outputs) == 2
@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    """Chat with a vision model using several images in one user message.

    Args:
        image_urls: URLs placed, in order, as ``image_url`` content parts
            ahead of the text question.
    """
    llm = LLM(
        model=os.path.join(models_path_prefix,
                           "microsoft/Phi-3.5-vision-instruct"),
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
    )

    content_parts = []
    for image_url in image_urls:
        content_parts.append({
            "type": "image_url",
            "image_url": {
                "url": image_url
            }
        })
    content_parts.append({"type": "text", "text": "What's in this image?"})
    messages = [{"role": "user", "content": content_parts}]

    outputs = llm.chat(messages)
    # The previous `>= 0` comparison is true for every sequence and so
    # verified nothing. One conversation in should give one output back.
    assert len(outputs) == 1
...@@ -6,9 +6,8 @@ import os ...@@ -6,9 +6,8 @@ import os
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from vllm import LLM from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ...conftest import cleanup
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
...@@ -41,7 +40,7 @@ def llm(): ...@@ -41,7 +40,7 @@ def llm():
del llm del llm
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
......
import os
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
def test_gpu_memory_utilization():
    """Create three LLM instances at ``gpu_memory_utilization=0.3`` each.

    All three are constructed before any generation runs. Each then generates
    for the same prompts and the prompt/completion pairs are printed.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # makes sure gpu_memory_utilization is per-instance limit,
    # not a global limit
    model_path = os.path.join(models_path_prefix, "facebook/opt-125m")
    llms = []
    for _ in range(3):
        llms.append(
            LLM(model=model_path,
                gpu_memory_utilization=0.3,
                enforce_eager=True))

    for llm in llms:
        for output in llm.generate(prompts, sampling_params):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
...@@ -6,11 +6,11 @@ import jsonschema ...@@ -6,11 +6,11 @@ import jsonschema
import pytest import pytest
import os import os
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from ...conftest import cleanup from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
...@@ -25,7 +25,7 @@ def llm(): ...@@ -25,7 +25,7 @@ def llm():
with llm.deprecate_legacy_api(): with llm.deprecate_legacy_api():
yield weakref.proxy(llm) yield weakref.proxy(llm)
del llm del llm
cleanup() cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
...@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm): ...@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
) guided_decoding=GuidedDecodingParams(regex=sample_regex))
outputs = llm.generate( outputs = llm.generate(prompts=[
prompts=[ f"Give an example IPv4 address with this regex: {sample_regex}"
f"Give an example IPv4 address with this regex: {sample_regex}" ] * 2,
] * 2, sampling_params=sampling_params,
sampling_params=sampling_params, use_tqdm=True)
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm): ...@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=1.0, temperature=1.0,
max_tokens=1000, max_tokens=1000,
) guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate( outputs = llm.generate(prompts=[
prompts=[ f"Give an example JSON for an employee profile "
f"Give an example JSON for an employee profile " f"that fits this schema: {sample_json_schema}"
f"that fits this schema: {sample_json_schema}" ] * 2,
] * 2, sampling_params=sampling_params,
sampling_params=sampling_params, use_tqdm=True)
use_tqdm=True,
guided_options_request=dict(guided_json=sample_json_schema))
assert outputs is not None assert outputs is not None
...@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm): ...@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm):
jsonschema.validate(instance=output_json, schema=sample_json_schema) jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.skip_global_cleanup
def test_guided_complex_json_completion(sample_complex_json_schema, llm):
    """Generate JSON guided by ``sample_complex_json_schema`` and validate it.

    The same prompt is submitted twice. Every completion must parse as JSON
    and pass ``jsonschema.validate`` against the schema.
    """
    guided = GuidedDecodingParams(json=sample_complex_json_schema)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=guided)

    request_prompt = (f"Give an example JSON for an assignment grade "
                      f"that fits this schema: {sample_complex_json_schema}")
    outputs = llm.generate(prompts=[request_prompt] * 2,
                           sampling_params=sampling_params,
                           use_tqdm=True)

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        # Raises if the text is not JSON or does not satisfy the schema.
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json,
                            schema=sample_complex_json_schema)
@pytest.mark.skip_global_cleanup
def test_guided_definition_json_completion(sample_definition_json_schema, llm):
    """Generate JSON guided by ``sample_definition_json_schema`` and validate.

    The same prompt is submitted twice. Every completion must parse as JSON
    and pass ``jsonschema.validate`` against the schema.
    """
    guided = GuidedDecodingParams(json=sample_definition_json_schema)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=guided)

    request_prompt = (
        f"Give an example JSON for solving 8x + 7 = -23 "
        f"that fits this schema: {sample_definition_json_schema}")
    outputs = llm.generate(prompts=[request_prompt] * 2,
                           sampling_params=sampling_params,
                           use_tqdm=True)

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        # Raises if the text is not JSON or does not satisfy the schema.
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json,
                            schema=sample_definition_json_schema)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guided_choice_completion(sample_guided_choice, llm): def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
) guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
outputs = llm.generate( outputs = llm.generate(
prompts="The best language for type-safe systems programming is ", prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=True, use_tqdm=True)
guided_options_request=dict(guided_choice=sample_guided_choice))
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm): ...@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
max_tokens=1000, max_tokens=1000,
) guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
outputs = llm.generate( outputs = llm.generate(
prompts=("Generate a sql state that select col_1 from " prompts=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"), "table_1 where it is equals to 1"),
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=True, use_tqdm=True,
guided_options_request=dict(guided_grammar=sample_sql_statements)) )
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm): ...@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm):
assert generated_text.strip() == ground_truth assert generated_text.strip() == ground_truth
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
    """Expect a ``DeprecationWarning`` when ``guided_options_request`` is passed.

    The warning message must match ``guided_options_request``.
    """
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    legacy_guided_options = dict(guided_regex=sample_regex)

    with pytest.warns(DeprecationWarning, match="guided_options_request"):
        llm.generate(prompts="This should fail",
                     sampling_params=sampling_params,
                     use_tqdm=True,
                     guided_options_request=legacy_guided_options)
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
    """Expect ``ValueError`` when guided decoding is supplied in both places.

    The regex is set both through ``SamplingParams.guided_decoding`` and
    through the ``guided_options_request`` argument; the error message must
    match ``Cannot set both``.
    """
    guided = GuidedDecodingParams(regex=sample_regex)
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     guided_decoding=guided)
    legacy_guided_options = dict(guided_regex=sample_regex)

    with pytest.raises(ValueError, match="Cannot set both"):
        llm.generate(prompts="This should fail",
                     sampling_params=sampling_params,
                     use_tqdm=True,
                     guided_options_request=legacy_guided_options)
@pytest.mark.skip_global_cleanup
def test_guided_json_object(llm):
    """Generate with ``json_object=True`` and check the result is a JSON object.

    No schema is supplied. Each completion only has to parse with
    ``json.loads`` and come back as a ``dict``.
    """
    guided = GuidedDecodingParams(json_object=True)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=100,
                                     guided_decoding=guided)

    request_prompt = ("Generate a JSON object describing a person with name "
                      "and age for John Smith who is 31 years old.")
    outputs = llm.generate(prompts=request_prompt,
                           sampling_params=sampling_params,
                           use_tqdm=True)

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)

        generated_text = output.outputs[0].text
        print(generated_text)
        assert generated_text is not None

        # Parse to verify it is valid JSON
        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
import os
import pytest
from vllm import LLM
from ...utils import error_on_warning, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
def test_pos_args_deprecated():
    """Check which ``LLM(...)`` call forms emit a ``DeprecationWarning``.

    Passing ``tokenizer`` by keyword runs inside
    ``error_on_warning(DeprecationWarning)`` (presumably turning such a
    warning into a failure; confirm against the helper). Passing it, or it
    and ``tokenizer_mode``, positionally must warn and name those parameters.
    """
    # Keyword `tokenizer`, with `model` given by keyword and then positionally.
    with error_on_warning(DeprecationWarning):
        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
    with error_on_warning(DeprecationWarning):
        LLM(MODEL_NAME, tokenizer=MODEL_NAME)

    # Extra positional arguments, paired with the expected warning pattern.
    positional_cases = (
        ((MODEL_NAME, MODEL_NAME), "'tokenizer'"),
        ((MODEL_NAME, MODEL_NAME, "auto"), "'tokenizer', 'tokenizer_mode'"),
    )
    for positional_args, warning_pattern in positional_cases:
        with pytest.warns(DeprecationWarning, match=warning_pattern):
            LLM(*positional_args)
import sys import sys
import os import os
from contextlib import nullcontext
from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix from ...utils import models_path_prefix
def test_lazy_outlines(sample_regex): def run_normal():
"""If users don't use guided decoding, outlines should not be imported.
"""
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
...@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex): ...@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex):
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM without guided decoding as a baseline.
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"), llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
enforce_eager=True, enforce_eager=True,
gpu_memory_utilization=0.3) gpu_memory_utilization=0.3)
...@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex): ...@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex):
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# make sure outlines is not imported # Destroy the LLM object and free up the GPU memory.
assert 'outlines' not in sys.modules del llm
cleanup_dist_env_and_memory()
def run_lmfe(sample_regex):
# Create an LLM with guided decoding enabled.
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"), llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
enforce_eager=True, enforce_eager=True,
guided_decoding_backend="lm-format-enforcer", guided_decoding_backend="lm-format-enforcer",
...@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex): ...@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex):
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def test_lazy_outlines(sample_regex):
"""If users don't use guided decoding, outlines should not be imported.
"""
# make sure outlines is not imported # make sure outlines is not imported
assert 'outlines' not in sys.modules module_name = "outlines"
# In CI, we only check finally if the module is imported.
# If it is indeed imported, we can rerun the test with `use_blame=True`,
# which will trace every function call to find the first import location,
# and help find the root cause.
# We don't run it in CI by default because it is slow.
use_blame = False
context = blame(
lambda: module_name in sys.modules) if use_blame else nullcontext()
with context as result:
run_normal()
run_lmfe(sample_regex)
if use_blame:
assert isinstance(result, BlameResult)
print(f"the first import location is:\n{result.trace_stack}")
assert module_name not in sys.modules, (
f"Module {module_name} is imported. To see the first"
f" import location, run the test with `use_blame=True`.")
...@@ -5,7 +5,22 @@ from vllm import LLM ...@@ -5,7 +5,22 @@ from vllm import LLM
from ...utils import models_path_prefix from ...utils import models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that requests ``run_with_both_engines`` for each test.

    This does nothing itself; it exists only to pull in that fixture so each
    test runs on both engines. It can be promoted up to conftest.py to run
    for every test in a package.
    """
    return None
def test_empty_prompt(): def test_empty_prompt():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2")) llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'): with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""]) llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
    """Expect ``ValueError`` for a prompt containing token id 999999.

    The error message must match ``out of vocabulary``.
    """
    model_path = os.path.join(models_path_prefix, "gpt2")
    llm = LLM(model=model_path, enforce_eager=True)

    token_prompt = {"prompt_token_ids": [999999]}
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate(token_prompt)
"""Tests for HF_HUB_OFFLINE mode""" """Tests for HF_HUB_OFFLINE mode"""
import importlib import importlib
import sys import sys
import weakref
import os import os
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from ...conftest import cleanup
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")
@pytest.fixture(scope="module") MODEL_CONFIGS = [
def llm(): {
# pytest caches the fixture so we use weakref.proxy to "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# enable garbage collection "enforce_eager": True,
llm = LLM(model=MODEL_NAME, "gpu_memory_utilization": 0.20,
max_num_batched_tokens=4096, "max_model_len": 64,
tensor_parallel_size=1, "max_num_batched_tokens": 64,
gpu_memory_utilization=0.10, "max_num_seqs": 64,
enforce_eager=True) "tensor_parallel_size": 1,
},
{
"model": os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
"enforce_eager": True,
"gpu_memory_utilization": 0.95,
"max_model_len": 64,
"max_num_batched_tokens": 64,
"max_num_seqs": 64,
"tensor_parallel_size": 1,
"tokenizer_mode": "mistral",
},
]
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm @pytest.fixture(scope="module")
def cache_models():
# Cache model files first
for model_config in MODEL_CONFIGS:
LLM(**model_config)
cleanup_dist_env_and_memory()
cleanup() yield
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch): @pytest.mark.usefixtures("cache_models")
# we use the llm fixture to ensure the model files are in-cache def test_offline_mode(monkeypatch):
del llm
# Set HF to offline mode and ensure we can still construct an LLM # Set HF to offline mode and ensure we can still construct an LLM
try: try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1") monkeypatch.setenv("HF_HUB_OFFLINE", "1")
# Need to re-import huggingface_hub and friends to setup offline mode # Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules() _re_import_modules()
# Cached model files should be used in offline mode # Cached model files should be used in offline mode
LLM(model=MODEL_NAME, for model_config in MODEL_CONFIGS:
max_num_batched_tokens=4096, LLM(**model_config)
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
finally: finally:
# Reset the environment after the test # Reset the environment after the test
# NB: Assuming tests are run in online mode # NB: Assuming tests are run in online mode
......
...@@ -11,6 +11,8 @@ import lm_eval ...@@ -11,6 +11,8 @@ import lm_eval
import pytest import pytest
import os import os
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
...@@ -19,22 +21,33 @@ TASK = "gsm8k" ...@@ -19,22 +21,33 @@ TASK = "gsm8k"
FILTER = "exact_match,strict-match" FILTER = "exact_match,strict-match"
RTOL = 0.03 RTOL = 0.03
EXPECTED_VALUE = 0.58 EXPECTED_VALUE = 0.58
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
MORE_ARGS_LIST = [ MORE_ARGS_LIST = [
[], # Default
["--enable-chunked-prefill"], # Chunked ["--enable-chunked-prefill"], # Chunked
["--num-scheduler-steps", "8"], # MS ["--num-scheduler-steps", "8"], # MS
["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream
] ]
MAX_WAIT_SECONDS = None
if current_platform.is_tpu():
MORE_ARGS_LIST = [
[], # Default
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
]
MAX_WAIT_SECONDS = 600
def run_test(more_args):
"""Run the end to end accuracy test."""
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy(more_args):
args = list(DEFAULT_ARGS) args = list(DEFAULT_ARGS)
args.extend(more_args) args.extend(more_args)
print(f"Running with: {args}") print(f"Running with: {args}")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(
MODEL_NAME, args,
max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
url = f"{remote_server.url_for('v1')}/completions" url = f"{remote_server.url_for('v1')}/completions"
model_args = ( model_args = (
...@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args): ...@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args):
assert (measured_value - RTOL < EXPECTED_VALUE assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""
    no_extra_args = []
    # The env var is set only inside this context and undone on exit.
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "1")
        run_test(no_extra_args)
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
    """Run with the V0 Engine."""
    # The env var is set only inside this context and undone on exit.
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "0")
        run_test(more_args)
import asyncio
import contextlib
import random
import time
from typing import Callable
import os
import openai
import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Module-scoped fixture yielding a ``RemoteOpenAIServer`` for MODEL_NAME.

    The server is used as a context manager around the ``yield``.
    """
    # use half precision for speed and memory savings in CI environment
    precision_args = ["--dtype", "bfloat16"]
    capacity_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
    loading_args = ["--load-format", "dummy"]
    cli_args = precision_args + capacity_args + loading_args

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from ``server.get_async_client()``.

    The client is held inside an ``async with`` block while the test runs.
    """
    client_context = server.get_async_client()
    async with client_context as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["completion", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {
            "prompt": " ".join(['A'] * 10_000)
        }),
        (lambda x: x.chat.completions.create, {
            "messages": [{
                "role": "user",
                "content": " ".join(['A'] * 10_000)
            }]
        }),
    ],
)
async def test_with_and_without_truncate(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Send mixed truncated and untruncated requests; none may return 500.

    Ten requests run concurrently: half with ``truncate_prompt_tokens=1000``
    and half with ``None``, in shuffled order.

    Args:
        server: Requested as a fixture; not referenced in the body.
        client: Async OpenAI client whose create method is called.
        create_func_gen: Selects the create method from ``client``.
        content_body: Endpoint-specific request payload (prompt or messages).
    """
    create_func = create_func_gen(client)
    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    num_requests = 10
    half = num_requests // 2
    truncation_settings = [1000] * half + [None] * (num_requests - half)
    random.shuffle(truncation_settings)

    request_kwargs = []
    for setting in truncation_settings:
        kwargs = dict(body)
        kwargs["extra_body"] = {'truncate_prompt_tokens': setting}
        request_kwargs.append(kwargs)

    async def get_status_code(**kwargs):
        # Map a completed call to 200 and an API error to its status code.
        try:
            await create_func(**kwargs)
        except openai.APIStatusError as e:
            return e.status_code
        return 200

    responses = await asyncio.gather(
        *(get_status_code(**kwargs) for kwargs in request_kwargs))
    assert all(status != 500 for status in responses)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["single completion", "multiple completions", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {
            "prompt": " ".join(['A'] * 300_000)
        }),
        (lambda x: x.completions.create, {
            "prompt": [" ".join(['A'] * 300_000)] * 2
        }),
        (lambda x: x.chat.completions.create, {
            "messages": [{
                "role": "user",
                "content": " ".join(['A'] * 300_000)
            }]
        }),
    ],
)
async def test_healthcheck_response_time(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Check that the health endpoint stays fast while requests are in flight.

    The health endpoint is timed once with no load, then again after 50
    request tasks have been created and given one second to start. The loaded
    time must be under 100x the unloaded time and under 0.1 seconds.

    Args:
        server: Running server; supplies the health URL.
        client: Async OpenAI client whose create method is called.
        create_func_gen: Selects the create method from ``client``.
        content_body: Endpoint-specific request payload (prompt or messages).
    """
    num_requests = 50

    create_func = create_func_gen(client)
    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    def timed_health_check():
        # The URL is resolved before the clock starts, so only the HTTP
        # round trip is measured.
        url = server.url_for("health")
        started = time.monotonic()
        res = requests.get(url)
        finished = time.monotonic()
        assert res.status_code == 200
        return finished - started

    no_load_response_time = timed_health_check()

    tasks = []
    for _ in range(num_requests):
        tasks.append(asyncio.create_task(create_func(**body)))
    await asyncio.sleep(1)  # give the tasks a chance to start running

    load_response_time = timed_health_check()

    # API status errors from the load requests are ignored here.
    with contextlib.suppress(openai.APIStatusError):
        await asyncio.gather(*tasks)

    assert load_response_time < 100 * no_load_response_time
    assert load_response_time < 0.1
...@@ -24,8 +24,11 @@ def server(): ...@@ -24,8 +24,11 @@ def server():
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--max-model-len", "--max-model-len",
"4096", "2048",
"--max-num-seqs",
"5",
"--enforce-eager", "--enforce-eager",
"--trust-remote-code",
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, ...@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}] }]
# test single completion # test single completion
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=10, messages=messages,
logprobs=True, max_completion_tokens=10,
top_logprobs=5) logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, ...@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
...@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded(
}] }]
# test single completion # test single completion
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=10, messages=messages,
logprobs=True, max_completion_tokens=10,
top_logprobs=5) logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
...@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_input_audio(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: Dict[str, str]):
    """Run a two-turn chat whose first message carries ``input_audio`` data.

    The first turn sends the base64 payload looked up by ``audio_url``
    together with a text question and checks the finish reason, the token
    usage and the reply. The second turn appends a follow-up question and
    checks that a reply is returned.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "input_audio",
                "input_audio": {
                    "data": base64_encoded_audio[audio_url],
                    "format": "wav"
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    # All 10 allowed completion tokens are expected to be used.
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    # The length check is trivially true; effectively this only requires
    # that some content object came back.
    assert message.content is not None and len(message.content) >= 0
...@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, ...@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
) )
output = chat_completion.choices[0].message.content output = chat_completion.choices[0].message.content
...@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, ...@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create( stream = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str,
base64_encoded_audio: Dict[str,
str]):
messages = [{
"role":
"user",
"content": [
{
"type": "input_audio",
"input_audio": {
"data": base64_encoded_audio[audio_url],
"format": "wav"
}
},
{
"type": "text",
"text": "What's happening in this audio?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
...@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, ...@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
audio_url: str): audio_url: str,
base64_encoded_audio: Dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, ...@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
} }
}, },
{ {
"type": "audio_url", "type": "input_audio",
"audio_url": { "input_audio": {
"url": audio_url "data": base64_encoded_audio[audio_url],
"format": "wav"
} }
}, },
{ {
...@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, ...@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create( await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
) )
......
import asyncio
from http import HTTPStatus from http import HTTPStatus
from typing import List
import openai import openai
import pytest import pytest
...@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix ...@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """Supply extra server CLI arguments through indirect parametrization.

    Without a parameter the fixture yields an empty list. A single string
    parameter is wrapped into a one-element list; any other parameter is
    handed back unchanged.

    Example:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    ``test_foo`` then runs twice, against servers started with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    try:
        extra = request.param
    except AttributeError:
        # The test was not parametrized over this fixture.
        return []

    return [extra] if isinstance(extra, str) else extra
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server(server_args):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
...@@ -24,6 +62,7 @@ def server(): ...@@ -24,6 +62,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--max-num-seqs", "--max-num-seqs",
"128", "128",
*server_args,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -36,20 +75,83 @@ async def client(server): ...@@ -36,20 +75,83 @@ async def client(server):
yield async_client yield async_client
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """Fetch the server's ``version`` URL and compare it to ``VLLM_VERSION``.

    Runs once with no extra server arguments and once with
    ``--disable-frontend-multiprocessing``.
    """
    response = requests.get(server.url_for("version"))
    # Turn any HTTP error status into a test failure before parsing JSON.
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    """Fetch the server's ``health`` URL and expect an HTTP 200 response.

    Runs once with no extra server arguments and once with
    ``--disable-frontend-multiprocessing``.
    """
    response = requests.get(server.url_for("health"))

    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(["--max-model-len", "10100"],
                     id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Flood the server with requests that time out, then expect a reply.

    200 chat requests asking for 10,000 tokens each are sent through a
    client with a 0.5 second timeout; every one must end in
    ``openai.APITimeoutError``. A follow-up short request through a client
    with a 5 second timeout must then succeed.
    """
    # clunky test: send an ungodly amount of load in with short timeouts
    # then ensure that it still responds quickly afterwards

    chat_input = [{"role": "user", "content": "Write a long story"}]

    # Use the clients as async context managers so they are closed again
    # instead of being left open for the rest of the module.
    async with server.get_async_client(timeout=0.5) as client:
        # Request about 2 million tokens
        tasks = [
            asyncio.create_task(
                client.chat.completions.create(
                    messages=chat_input,
                    model=MODEL_NAME,
                    max_tokens=10000,
                    extra_body={"min_tokens": 10000}))
            for _ in range(200)
        ]

        done, pending = await asyncio.wait(tasks,
                                           return_when=asyncio.ALL_COMPLETED)

    # Make sure all requests were sent to the server and timed out
    # (We don't want to hide other errors like 400s that would invalidate this
    # test)
    assert len(pending) == 0
    for d in done:
        with pytest.raises(openai.APITimeoutError):
            d.result()

    # If the server had not cancelled all the other requests, then it would not
    # be able to respond to this one within the timeout
    async with server.get_async_client(timeout=5) as client:
        response = await client.chat.completions.create(messages=chat_input,
                                                        model=MODEL_NAME,
                                                        max_tokens=10)

    assert len(response.choices) == 1
...@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files # noqa: F401 ...@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): ...@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?" "content": "what is 1+1?"
}] }]
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=5, messages=messages,
temperature=0.0, max_completion_tokens=5,
logprobs=False) temperature=0.0,
logprobs=False)
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.logprobs is None assert choice.logprobs is None
...@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): ...@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?" "content": "what is 1+1?"
}] }]
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=5, messages=messages,
temperature=0.0, max_completion_tokens=5,
logprobs=True, temperature=0.0,
top_logprobs=0) logprobs=True,
top_logprobs=0)
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.logprobs is not None assert choice.logprobs is not None
...@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): ...@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content": "what is 1+1?" "content": "what is 1+1?"
}] }]
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=5, messages=messages,
temperature=0.0, max_completion_tokens=5,
logprobs=True, temperature=0.0,
top_logprobs=5) logprobs=True,
top_logprobs=5)
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.logprobs is not None assert choice.logprobs is not None
...@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with pytest.raises((openai.BadRequestError, openai.APIError)): with pytest.raises((openai.BadRequestError, openai.APIError)):
stream = await client.chat.completions.create(model=model_name, stream = await client.chat.completions.create(model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=21, top_logprobs=21,
stream=True) stream=True)
...@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=model_name, await client.chat.completions.create(model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=30, top_logprobs=30,
stream=False) stream=False)
# the server should still work afterwards # the server should still work afterwards
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=10, messages=messages,
stream=False) max_completion_tokens=10,
stream=False)
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
...@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, ...@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
}] }]
# test single completion # test single completion
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(
messages=messages, model=model_name,
max_tokens=10, messages=messages,
logprobs=True, max_completion_tokens=10,
top_logprobs=5) logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None assert chat_completion.id is not None
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
...@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, ...@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
...@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): ...@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
) )
output = chat_completion.choices[0].message.content output = chat_completion.choices[0].message.content
...@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): ...@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
stream = await client.chat.completions.create( stream = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
...@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create( stream = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
stream_options={"include_usage": False}) stream_options={"include_usage": False})
...@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
# "continuous_usage_stats": False}} # "continuous_usage_stats": False}}
stream = await client.chat.completions.create(model=model_name, stream = await client.chat.completions.create(model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
stream_options={ stream_options={
...@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await client.chat.completions.create( await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=False, stream=False,
stream_options={"include_usage": None}) stream_options={"include_usage": None})
...@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await client.chat.completions.create( await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
temperature=0.0, temperature=0.0,
stream=False, stream=False,
stream_options={"include_usage": True}) stream_options={"include_usage": True})
...@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create( stream = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0, temperature=0.0,
stream=True, stream=True,
stream_options={ stream_options={
"include_usage": True, "include_usage": True,
"continuous_usage_stats": True "continuous_usage_stats": True,
}, },
) )
last_completion_tokens = 0
async for chunk in stream: async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0 assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0 assert last_completion_tokens == 0 or \
chunk.usage.completion_tokens > last_completion_tokens or \
(
not chunk.choices and
chunk.usage.completion_tokens == last_completion_tokens
)
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens) chunk.usage.completion_tokens)
last_completion_tokens = chunk.usage.completion_tokens
assert last_completion_tokens == 10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat` # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
...@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content choice1 = chat_completion.choices[0].message.content
...@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content choice2 = chat_completion.choices[0].message.content
...@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, ...@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema, extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, ...@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema, extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, ...@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=20, max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex, extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content ip1 = chat_completion.choices[0].message.content
...@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, ...@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=20, max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex, extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content ip2 = chat_completion.choices[0].message.content
...@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=5, top_logprobs=5,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(guided_choice=sample_guided_choice,
...@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, ...@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=[{
"type": "function", "type": "function",
"function": { "function": {
...@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, ...@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create( stream = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=[{
"type": "function", "type": "function",
"function": { "function": {
...@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported(
await client.chat.completions.create( await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=[{
"type": "function", "type": "function",
"function": { "function": {
...@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported(
await client.chat.completions.create( await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=[{
"type": "function", "type": "function",
"function": { "function": {
...@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, ...@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=MODEL_NAME, await client.chat.completions.create(model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tool_choice={ tool_choice={
"type": "function", "type": "function",
"function": { "function": {
...@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, ...@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
await client.chat.completions.create( await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=[{
"type": "function", "type": "function",
"function": { "function": {
...@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, ...@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
"name": "nondefined_function_name" "name": "nondefined_function_name"
} }
}) })
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice={})
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): ...@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI): async def test_response_format_json_schema(client: openai.AsyncOpenAI):
prompt = 'what is 1+1? The format is "result": 2'
# Check that this prompt cannot lead to a valid JSON without json_schema
for _ in range(2): for _ in range(2):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=[{ messages=[{
"role": "role": "user",
"user", "content": prompt
"content": ('what is 1+1? please respond with a JSON object, ' }],
'the format is {"result": 2}') )
content = resp.choices[0].message.content
assert content is not None
with pytest.raises((json.JSONDecodeError, AssertionError)):
loaded = json.loads(content)
assert loaded == {"result": 2}, loaded
for _ in range(2):
resp = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": prompt
}], }],
response_format={ response_format={
"type": "json_schema", "type": "json_schema",
...@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): ...@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_extra_fields(client: openai.AsyncOpenAI): async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info: resp = await client.chat.completions.create(
await client.chat.completions.create( model=MODEL_NAME,
model=MODEL_NAME, messages=[{
messages=[{ "role": "user",
"role": "system", "content": "what is 1+1?",
"content": "You are a helpful assistant.", "extra_field": "0",
"extra_field": "0", }], # type: ignore
}], # type: ignore temperature=0,
temperature=0, seed=0)
seed=0)
content = resp.choices[0].message.content
assert "extra_forbidden" in exc_info.value.message assert content is not None
@pytest.mark.asyncio @pytest.mark.asyncio
......
from typing import NamedTuple
import os
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer, models_path_prefix
# any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
# Minimal Jinja chat template: renders every message as "<role>: <content>"
# followed by a newline, with nothing added after the final message.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def server():
    """Launch one ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared per module.

    The server is started with ``DUMMY_CHAT_TEMPLATE`` as its chat template.
    The ``with`` block is only left once pytest finalizes the fixture.
    """
    # half precision: faster and lighter on memory in the CI environment
    launch_args = ["--dtype", "float16"]
    launch_args += ["--enforce-eager"]
    launch_args += ["--max-model-len", "4080"]
    launch_args += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the module-level ``server``."""
    async with server.get_async_client() as openai_client:
        yield openai_client
class TestCase(NamedTuple):
    """Parameters for one echo / continue_final_message scenario."""

    # The class name matches pytest's ``Test*`` collection pattern, but this is
    # a plain parameter record with a ``__new__`` constructor; opt out of
    # collection explicitly so pytest does not emit a collection warning.
    __test__ = False

    # model name passed as ``model=`` in the chat completion request
    model_name: str
    # value sent as the ``echo`` entry of the request's ``extra_body``
    echo: bool
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(model_name=MODEL_NAME, echo=True),
        TestCase(model_name=MODEL_NAME, echo=False)
    ],
)
async def test_chat_session_with_echo_and_continue_final_message(
        client: openai.AsyncOpenAI, test_case: TestCase):
    """Check ``echo`` together with ``continue_final_message``.

    The conversation ends with an unfinished assistant message. The response
    must contain that text when ``echo`` is set and must not contain it
    otherwise.
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"

    conversation = [
        {
            "role": "user",
            "content": "tell me a common saying"
        },
        {
            "role": "assistant",
            "content": saying
        },
    ]
    # request fields sent through extra_body rather than as client keywords
    request_extras = {
        "echo": test_case.echo,
        "continue_final_message": True,
        "add_generation_prompt": False
    }

    # test echo with continue_final_message parameter
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=conversation,
        extra_body=request_extras)

    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "stop"

    message = choice.message
    assert message.content is not None
    prefix_present = saying in message.content
    assert prefix_present if test_case.echo else not prefix_present
    assert message.role == "assistant"
...@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists() ...@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs # Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [ MODEL_TEMPLATE_GENERATON_OUTPUT = [
(os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, True, """<|im_start|>user (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|> Hello<|im_end|>
<|im_start|>assistant <|im_start|>assistant
Hi there!<|im_end|> Hi there!<|im_end|>
...@@ -21,12 +21,20 @@ Hi there!<|im_end|> ...@@ -21,12 +21,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|> What is the capital of<|im_end|>
<|im_start|>assistant <|im_start|>assistant
"""), """),
(os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, """<|im_start|>user (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, False, """<|im_start|>user
Hello<|im_end|> Hello<|im_end|>
<|im_start|>assistant <|im_start|>assistant
Hi there!<|im_end|> Hi there!<|im_end|>
<|im_start|>user <|im_start|>user
What is the capital of""") What is the capital of"""),
(os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""),
] ]
TEST_MESSAGES = [ TEST_MESSAGES = [
...@@ -43,6 +51,10 @@ TEST_MESSAGES = [ ...@@ -43,6 +51,10 @@ TEST_MESSAGES = [
'content': 'What is the capital of' 'content': 'What is the capital of'
}, },
] ]
# Unfinished assistant turn that test_get_gen_prompt appends to TEST_MESSAGES
# for the cases where continue_final_message is set.
ASSISTANT_MESSAGE_TO_CONTINUE = {
    'role': 'assistant',
    'content': 'The capital of'
}
def test_load_chat_template(): def test_load_chat_template():
...@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike(): ...@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,template,add_generation_prompt,expected_output", "model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT) MODEL_TEMPLATE_GENERATON_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt, def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output): continue_final_message, expected_output):
# Initialize the tokenizer # Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model) tokenizer = get_tokenizer(tokenizer_name=model)
template_content = load_chat_template(chat_template=template) template_content = load_chat_template(chat_template=template)
...@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments # Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest( mock_request = ChatCompletionRequest(
model=model, model=model,
messages=TEST_MESSAGES, messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
add_generation_prompt=add_generation_prompt) if continue_final_message else TEST_MESSAGES,
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
)
# Call the function and get the result # Call the function and get the result
result = apply_hf_chat_template( result = apply_hf_chat_template(
...@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation=mock_request.messages, conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content, chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
) )
# Test assertion # Test assertion
......
import os
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer, models_path_prefix
# any model with a chat template should work here
# (this is the model the module-scoped `server` fixture launches)
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope="module")
def server():
    """Launch one ``RemoteOpenAIServer`` with chunked prefill, shared per module.

    The ``with`` block is only left once pytest finalizes the fixture.
    """
    # half precision: faster and lighter on memory in the CI environment
    precision_flags = ["--dtype", "bfloat16"]
    context_flags = ["--max-model-len", "8192", "--enforce-eager"]
    # chunked prefill with a 1000-token batch budget; presumably chosen so the
    # long prompts used in this module are prefilled in several chunks
    scheduler_flags = [
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    # large prompts create a lot of output
    quiet_flags = ["--disable-log-requests"]

    launch_args = (precision_flags + context_flags + scheduler_flags +
                   quiet_flags)
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the module-level ``server``."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a completion for a long prompt and check per-chunk usage.

    Every chunk must carry consistent usage numbers, every chunk up to and
    including the finishing one must carry text, and from the finishing chunk
    on the reported completion token count must equal the number of chunks
    counted.
    """
    # Test stream with long prompt
    long_prompt = "What is the capital of France?" * 400
    usage_options = {
        "include_usage": True,
        "continuous_usage_stats": True,
    }
    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=long_prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options=usage_options,
        logprobs=5,
    )

    received = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not done:
            first_choice = chunk.choices[0]
            received += 1
            assert first_choice.text
            # the chunk that carries a finish_reason ends the counting phase
            done = first_choice.finish_reason is not None

        if done:
            assert usage.completion_tokens == received
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a chat completion for a long prompt and check per-chunk usage.

    Chunks whose delta content is the empty string must report zero
    completion tokens and no logprobs, and at most one such chunk may appear.
    From the finishing chunk on, the reported completion token count must
    equal the number of non-empty chunks counted.
    """
    # Test stream with long prompt
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {
        "role": "user",
        "content": "What is the capital of France?" * 400
    }
    usage_options = {
        "include_usage": True,
        "continuous_usage_stats": True,
    }
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[system_turn, user_turn],
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options=usage_options,
        logprobs=True,
        top_logprobs=5,
    )

    received = 0
    empty_chunks = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not done:
            first_choice = chunk.choices[0]
            if first_choice.delta.content == "":
                # when there is no tokens generated
                assert usage.completion_tokens == 0
                assert first_choice.logprobs is None
                empty_chunks += 1
            else:
                received += 1
            # the chunk that carries a finish_reason ends the counting phase
            done = first_choice.finish_reason is not None

        if done:
            assert usage.completion_tokens == received

    assert empty_chunks <= 1
import json import json
import unittest
from vllm.entrypoints.openai.cli_args import make_arg_parser import pytest
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = { LORA_MODULE = {
"name": "module2", "name": "module2",
"path": "/path/to/module2", "path": "/path/to/module2",
"base_model_name": "llama" "base_model_name": "llama"
} }
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
class TestLoraParserAction(unittest.TestCase): @pytest.fixture
def serve_parser():
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
return make_arg_parser(parser)
def setUp(self):
# Setting up argparse parser for tests
parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
self.parser = make_arg_parser(parser)
def test_valid_key_value_format(self): ### Tests for Lora module parsing
# Test old format: name=path def test_valid_key_value_format(serve_parser):
args = self.parser.parse_args([ # Test old format: name=path
'--lora-modules', args = serve_parser.parse_args([
'module1=/path/to/module1', '--lora-modules',
'module1=/path/to/module1',
])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
# Test valid JSON format input
args = serve_parser.parse_args([
'--lora-modules',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
# Test invalid JSON format input, missing closing brace
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
]) ])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
self.assertEqual(args.lora_modules, expected)
def test_valid_json_format(self):
# Test valid JSON format input def test_invalid_type_error(serve_parser):
args = self.parser.parse_args([ # Test type error when values are not JSON or key=value
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '--lora-modules',
json.dumps(LORA_MODULE), 'invalid_format' # This is not JSON or key=value format
]) ])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2', def test_invalid_json_field(serve_parser):
base_model_name='llama') # Test valid JSON format but missing required fields
] with pytest.raises(SystemExit):
self.assertEqual(args.lora_modules, expected) serve_parser.parse_args([
def test_invalid_json_format(self):
# Test invalid JSON format input, missing closing brace
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module3", "path": "/path/to/module3"'
])
def test_invalid_type_error(self):
# Test type error when values are not JSON or key=value
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'invalid_format' # This is not JSON or key=value format
])
def test_invalid_json_field(self):
# Test valid JSON format but missing required fields
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module4"}' # Missing required 'path' field
])
def test_empty_values(self):
# Test when no LoRA modules are provided
args = self.parser.parse_args(['--lora-modules', ''])
self.assertEqual(args.lora_modules, [])
def test_multiple_valid_inputs(self):
# Test multiple valid inputs (both old and JSON format)
args = self.parser.parse_args([
'--lora-modules', '--lora-modules',
'module1=/path/to/module1', '{"name": "module4"}' # Missing required 'path' field
json.dumps(LORA_MODULE),
]) ])
expected = [
LoRAModulePath(name='module1', path='/path/to/module1'),
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
self.assertEqual(args.lora_modules, expected)
if __name__ == '__main__': def test_empty_values(serve_parser):
unittest.main() # Test when no LoRA modules are provided
args = serve_parser.parse_args(['--lora-modules', ''])
assert args.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """Parse one ``name=path`` entry and one JSON entry in the same flag."""
    # Test multiple valid inputs (both old and JSON format)
    argv = [
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ]
    parsed = serve_parser.parse_args(argv)

    from_key_value = LoRAModulePath(name='module1', path='/path/to/module1')
    from_json = LoRAModulePath(name='module2',
                               path='/path/to/module2',
                               base_model_name='llama')
    assert parsed.lora_modules == [from_key_value, from_json]
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
    # NOTE: despite "passes" in the test name, the expected outcome here is a
    # TypeError from validation, because no --tool-call-parser is supplied.
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser"""
    argv = ["--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
    parsed = serve_parser.parse_args(args=argv)
    # no exception expected here
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists"""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])
    # no exception expected here
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist"""
    argv = ["--chat-template", "does/not/exist"]
    parsed = serve_parser.parse_args(args=argv)
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
...@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): ...@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
# test using token IDs # test using token IDs
completion = await client.completions.create( with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
model=MODEL_NAME, # Added tokens should be rejected by the base model
prompt=[0, 0, 32000, 32001, 32002], await client.completions.create(
echo=True, model=MODEL_NAME,
max_tokens=5, prompt=[0, 0, 32000, 32001, 32002],
temperature=0.0, echo=True,
) max_tokens=5,
# Added tokens should not appear in tokenized prompt temperature=0.0,
assert "vllm" not in completion.choices[0].text )
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, ...@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert "".join(chunks) == single_output assert "".join(chunks) == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Stream ``n`` parallel samples and check each one separately.

    All samples arrive interleaved on one stream; each chunk's choice index
    says which sample its text belongs to. Every sample must report a finish
    reason once and must be made of exactly ``max_tokens`` text pieces.
    """
    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=max_tokens,
                                             n=n,
                                             stream=True)

    # one bucket of text pieces per sample, addressed by choice index
    pieces_by_sample: List[List[str]] = [[] for _ in range(n)]
    finished_samples = 0
    async for chunk in stream:
        first_choice = chunk.choices[0]
        pieces_by_sample[first_choice.index].append(first_choice.text)
        if first_choice.finish_reason is not None:
            finished_samples += 1

    assert finished_samples == n
    for pieces in pieces_by_sample:
        assert len(pieces) == max_tokens
        print("".join(pieces))
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",
...@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): ...@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
extra_body=dict( extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary # NOTE: this has to be true for n > 1 in vLLM, but
# for official client. # not necessary for official client.
use_beam_search=True), use_beam_search=True),
) )
assert len(batch.choices) == 4 assert len(batch.choices) == 4
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment