Commit 469e903b authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -5,7 +5,6 @@ import os ...@@ -5,7 +5,6 @@ import os
from vllm import LLM from vllm import LLM
from ...utils import models_path_prefix from ...utils import models_path_prefix
from vllm.config import LoadFormat
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -17,17 +16,13 @@ def v1(run_with_both_engines): ...@@ -17,17 +16,13 @@ def v1(run_with_both_engines):
def test_empty_prompt(): def test_empty_prompt():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'): with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""]) llm.generate([""])
@pytest.mark.skip_v1 @pytest.mark.skip_v1
def test_out_of_vocab_token(): def test_out_of_vocab_token():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'): with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]}) llm.generate({"prompt_token_ids": [999999]})
...@@ -56,32 +56,37 @@ def cache_models(): ...@@ -56,32 +56,37 @@ def cache_models():
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models") @pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch): def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM # Set HF to offline mode and ensure we can still construct an LLM
try: with monkeypatch.context() as m:
monkeypatch.setenv("HF_HUB_OFFLINE", "1") try:
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") m.setenv("HF_HUB_OFFLINE", "1")
m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed") def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed")
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
disable_connect) m.setattr(
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", urllib3.connection.HTTPConnection,
disable_connect) "connect",
disable_connect,
# Need to re-import huggingface_hub and friends to setup offline mode )
_re_import_modules() m.setattr(
# Cached model files should be used in offline mode urllib3.connection.HTTPSConnection,
for model_config in MODEL_CONFIGS: "connect",
LLM(**model_config) disable_connect,
finally: )
# Reset the environment after the test
# NB: Assuming tests are run in online mode # Need to re-import huggingface_hub
monkeypatch.delenv("HF_HUB_OFFLINE") # and friends to setup offline mode
monkeypatch.delenv("VLLM_NO_USAGE_STATS") _re_import_modules()
_re_import_modules() # Cached model files should be used in offline mode
pass for model_config in MODEL_CONFIGS:
LLM(**model_config)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
_re_import_modules()
def _re_import_modules(): def _re_import_modules():
......
...@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500 ...@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500
TASK = "gsm8k" TASK = "gsm8k"
FILTER = "exact_match,strict-match" FILTER = "exact_match,strict-match"
RTOL = 0.03 RTOL = 0.03
EXPECTED_VALUE = 0.58 EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
MORE_ARGS_LIST = [ MORE_ARGS_LIST = [
[], # Default [], # Default
...@@ -71,7 +71,7 @@ def run_test(more_args): ...@@ -71,7 +71,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda() @pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(), and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU") reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch): def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine.""" """Run with the V1 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): ...@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
more_args):
"""Run with the V0 Engine.""" """Run with the V0 Engine."""
with monkeypatch.context() as m: with monkeypatch.context() as m:
......
...@@ -10,7 +10,6 @@ import asyncio ...@@ -10,7 +10,6 @@ import asyncio
import io import io
import time import time
from statistics import mean, median from statistics import mean, median
from typing import List
import librosa import librosa
import pytest import pytest
...@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request): ...@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request):
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "") _ = await bound_transcribe(model, sem, client, (audio, sr), "")
tasks: List[asyncio.Task] = [] tasks: list[asyncio.Task] = []
for sample in data: for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task( task = asyncio.create_task(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -180,7 +178,7 @@ def test_reasoning( ...@@ -180,7 +178,7 @@ def test_reasoning(
): ):
output = tokenizer.tokenize(param_dict["output"]) output = tokenizer.tokenize(param_dict["output"])
# decode everything to tokens # decode everything to tokens
output_tokens: List[str] = [ output_tokens: list[str] = [
tokenizer.convert_tokens_to_string([token]) for token in output tokenizer.convert_tokens_to_string([token]) for token in output
] ]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Tuple, Union from typing import Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) DeltaMessage)
...@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor: ...@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor:
def run_reasoning_extraction( def run_reasoning_extraction(
reasoning_parser: ReasoningParser, reasoning_parser: ReasoningParser,
model_output: List[str], model_output: list[str],
request: Union[ChatCompletionRequest, None] = None, request: Union[ChatCompletionRequest, None] = None,
streaming: bool = False, streaming: bool = False,
) -> Tuple[Optional[str], Optional[str]]: ) -> tuple[Optional[str], Optional[str]]:
if streaming: if streaming:
reconstructor = run_reasoning_extraction_streaming( reconstructor = run_reasoning_extraction_streaming(
reasoning_parser, reasoning_parser,
...@@ -55,9 +55,9 @@ def run_reasoning_extraction( ...@@ -55,9 +55,9 @@ def run_reasoning_extraction(
def run_reasoning_extraction_nonstreaming( def run_reasoning_extraction_nonstreaming(
reasoning_parser: ReasoningParser, reasoning_parser: ReasoningParser,
model_output: List[str], model_output: list[str],
request: Union[ChatCompletionRequest, None] = None, request: Union[ChatCompletionRequest, None] = None,
) -> Tuple[Optional[str], Optional[str]]: ) -> tuple[Optional[str], Optional[str]]:
request = request or ChatCompletionRequest(messages=[], model="test-model") request = request or ChatCompletionRequest(messages=[], model="test-model")
return reasoning_parser.extract_reasoning_content( return reasoning_parser.extract_reasoning_content(
model_output=''.join(model_output), request=request) model_output=''.join(model_output), request=request)
...@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming( ...@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming(
def run_reasoning_extraction_streaming( def run_reasoning_extraction_streaming(
reasoning_parser: ReasoningParser, reasoning_parser: ReasoningParser,
model_deltas: List[str], model_deltas: list[str],
request: Union[ChatCompletionRequest, None] = None, request: Union[ChatCompletionRequest, None] = None,
) -> StreamingReasoningReconstructor: ) -> StreamingReasoningReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model") request = request or ChatCompletionRequest(messages=[], model="test-model")
reconstructor = StreamingReasoningReconstructor() reconstructor = StreamingReasoningReconstructor()
previous_text = "" previous_text = ""
previous_tokens: List[int] = [] previous_tokens: list[int] = []
for delta in model_deltas: for delta in model_deltas:
token_delta = [ token_delta = [
reasoning_parser.vocab.get(token) reasoning_parser.vocab.get(token)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import openai import openai
import pytest import pytest
import os import os
...@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [ ...@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--dtype",
"bfloat16",
"--max-model-len", "--max-model-len",
"2048", "2048",
"--max-num-seqs", "--max-num-seqs",
...@@ -41,7 +37,7 @@ async def client(server): ...@@ -41,7 +37,7 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]: def base64_encoded_audio() -> dict[str, str]:
return { return {
audio_url: encode_audio_base64(*fetch_audio(audio_url)) audio_url: encode_audio_base64(*fetch_audio(audio_url))
for audio_url in TEST_AUDIO_URLS for audio_url in TEST_AUDIO_URLS
...@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, ...@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211) completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, ...@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio_base64encoded( async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, model_name: str, audio_url: str, client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, str]): base64_encoded_audio: dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded(
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211) completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded( ...@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded(
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_input_audio( async def test_single_chat_session_input_audio(
client: openai.AsyncOpenAI, model_name: str, audio_url: str, client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, str]): base64_encoded_audio: dict[str, str]):
messages = [{ messages = [{
"role": "role":
"user", "user",
...@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio( ...@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio(
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211) completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, ...@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, ...@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, base64_encoded_audio: dict[str,
str]): str]):
messages = [{ messages = [{
"role": "role":
...@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, ...@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, ...@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
audio_url: str, audio_url: str,
base64_encoded_audio: Dict[str, str]): base64_encoded_audio: dict[str, str]):
messages = [{ messages = [{
"role": "role":
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
import asyncio import asyncio
from http import HTTPStatus from http import HTTPStatus
from typing import List
import openai import openai
import pytest import pytest
...@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") ...@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]: def server_args(request: pytest.FixtureRequest) -> list[str]:
""" Provide extra arguments to the server via indirect parametrization """ Provide extra arguments to the server via indirect parametrization
Usage: Usage:
...@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer): ...@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
extra_headers={ extra_headers={
"Content-Type": "application/x-www-form-urlencoded" "Content-Type": "application/x-www-form-urlencoded"
}) })
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(["--enable-server-load-tracking"],
                     id="enable-server-load-tracking")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_server_load(server: RemoteOpenAIServer):
    """Check the value reported by the ``load`` endpoint around a request.

    The server is started with ``--enable-server-load-tracking``. The
    reported ``server_load`` is expected to be 0 before any request, 1 while
    a long completion request is in flight, and 0 again once it has finished.
    """

    def current_load():
        # Query the load endpoint and return the reported counter.
        reply = requests.get(server.url_for("load"))
        assert reply.status_code == HTTPStatus.OK
        return reply.json().get("server_load")

    def post_long_completion():
        # Blocking call; it is run in a worker thread below.
        payload = {
            "prompt": "Give me a long story",
            "max_tokens": 1000,
            "temperature": 0,
        }
        return requests.post(
            server.url_for("v1/completions"),
            headers={"Content-Type": "application/json"},
            json=payload,
        )

    # Nothing is running yet.
    assert current_load() == 0

    # Kick off the completion in a background thread.
    pending = asyncio.create_task(asyncio.to_thread(post_long_completion))

    # Short pause so the request has a chance to reach the server.
    await asyncio.sleep(0.1)
    assert current_load() == 1

    # Let the completion finish, then give the server a moment to settle.
    await pending
    await asyncio.sleep(0.1)
    assert current_load() == 0
...@@ -3,13 +3,14 @@ ...@@ -3,13 +3,14 @@
# imports for guided decoding tests # imports for guided decoding tests
import json import json
import re import re
from typing import Dict, List, Optional from typing import Optional
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import os import os
import pytest_asyncio import pytest_asyncio
import requests
import torch import torch
from openai import BadRequestError from openai import BadRequestError
...@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str, model_name: str,
prompt_logprobs: Optional[int]): prompt_logprobs: Optional[int]):
params: Dict = { params: dict = {
"messages": [{ "messages": [{
"role": "system", "role": "system",
"content": "You are a helpful assistant." "content": "You are a helpful assistant."
...@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, ...@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
) )
async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI, async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
params: Dict = { params: dict = {
"messages": [{ "messages": [{
"role": "system", "role": "system",
"content": "You are a helpful assistant." "content": "You are a helpful assistant."
...@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): ...@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI): ...@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI):
assert ("greater_than_equal" in exc_info.value.message assert ("greater_than_equal" in exc_info.value.message
or "less_than_equal" in exc_info.value.message) or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
    """POST a chat completion without a model field; expect non-empty text."""
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": "what is 1+1?"
        },
    ]
    # The payload intentionally carries no model name.
    payload = {"messages": conversation, "max_tokens": 5}

    response = requests.post(
        f"http://localhost:{server.port}/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    response_data = response.json()
    print(response_data)

    message = response_data.get("choices")[0].get("message")
    assert message is not None

    content = message.get("content")
    assert content is not None
    assert len(content) > 0
...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix ...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -23,8 +22,6 @@ def server(): ...@@ -23,8 +22,6 @@ def server():
"--enforce-eager", "--enforce-eager",
"--max-model-len", "--max-model-len",
"4080", "4080",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
...@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Call the function and get the result # Call the function and get the result
result = apply_hf_chat_template( result = apply_hf_chat_template(
tokenizer, tokenizer,
trust_remote_code=True,
conversation=mock_request.messages, conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content, chat_template=mock_request.chat_template or template_content,
tools=None,
add_generation_prompt=mock_request.add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message, continue_final_message=mock_request.continue_final_message,
) )
......
# SPDX-License-Identifier: Apache-2.0
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# A model that does both reasoning and tool calling; the tests below expect
# reasoning text and a tool call in the same response.
MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Launch a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    The server is started with the ``deepseek_r1`` reasoning parser and the
    ``hermes`` tool-call parser with automatic tool choice enabled.
    """
    cli_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--enable-reasoning",
        "--reasoning-parser",
        "deepseek_r1",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    async with server.get_async_client() as api_client:
        yield api_client
# Schema of each argument accepted by the weather tool.
_WEATHER_PROPERTIES = {
    "city": {
        "type": "string",
        "description":
        "The city to find the weather for, e.g. 'San Francisco'",
    },
    "state": {
        "type": "string",
        "description":
        "the two-letter abbreviation for the state that the city is"
        " in, e.g. 'CA' which would mean 'California'",
    },
    "unit": {
        "type": "string",
        "description": "The unit to fetch the temperature in",
        "enum": ["celsius", "fahrenheit"],
    },
}

# Tool definitions sent with every chat request in this module.
TOOLS = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": _WEATHER_PROPERTIES,
            "required": ["city", "state", "unit"],
        },
    },
}]
# Conversation sent to the model; the last user turn should trigger the tool.
MESSAGES = [
    {
        "role": "user",
        "content": "Hi! How are you doing today?",
    },
    {
        "role": "assistant",
        "content": "I'm doing well! How can I help you?",
    },
    {
        "role": "user",
        "content":
        "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
    },
]

# Tool name and serialized arguments the tests compare against.
FUNC_NAME = "get_current_weather"
FUNC_ARGS = '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
def extract_reasoning_and_calls(
        chunks: list) -> tuple[str, list[str], list[str]]:
    """Reassemble reasoning text and tool calls from streamed chat chunks.

    Only the first choice of each chunk is inspected, and only the first
    tool-call delta of that choice.

    Args:
        chunks: Streamed chunk objects, each exposing ``choices`` whose first
            element has a ``delta`` with ``tool_calls`` and, optionally,
            ``reasoning_content``.

    Returns:
        A ``(reasoning_content, arguments, function_names)`` tuple:
        the concatenated reasoning text, the accumulated argument string of
        each tool call, and the function name of each tool call.
    """
    reasoning_content = ""
    tool_call_idx = -1
    arguments: list[str] = []
    function_names: list[str] = []
    for chunk in chunks:
        # A chunk may arrive without any choices (presumably e.g. a
        # usage-only chunk); skip it rather than fail on ``choices[0]``.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta

        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            # A new index marks the start of another tool call.
            if tool_call.index != tool_call_idx:
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")

            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        else:
            # The attribute can be absent or explicitly None; only a real
            # string fragment is appended.
            reasoning_piece = getattr(delta, "reasoning_content", None)
            if reasoning_piece:
                reasoning_content += reasoning_piece
    return reasoning_content, arguments, function_names
# Streaming: reasoning text and the tool call arrive as incremental deltas.
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(
        client: openai.AsyncOpenAI):
    """Stream a chat completion and check reasoning plus the tool call."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    # Drain the whole stream before inspecting it.
    chunks = [chunk async for chunk in stream]

    extracted = extract_reasoning_and_calls(chunks)
    reasoning_content, arguments, function_names = extracted

    assert len(reasoning_content) > 0
    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
# Non-streaming: the whole response is returned in a single message.
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Request a full chat completion and check reasoning plus tool call."""
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = completion.choices[0].message

    assert len(message.reasoning_content) > 0

    called = message.tool_calls[0].function
    assert called.name == FUNC_NAME
    assert called.arguments == FUNC_ARGS
...@@ -26,7 +26,7 @@ def serve_parser(): ...@@ -26,7 +26,7 @@ def serve_parser():
return make_arg_parser(parser) return make_arg_parser(parser)
### Tests for Lora module parsing ### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser): def test_valid_key_value_format(serve_parser):
# Test old format: name=path # Test old format: name=path
args = serve_parser.parse_args([ args = serve_parser.parse_args([
......
...@@ -5,7 +5,7 @@ import json ...@@ -5,7 +5,7 @@ import json
import re import re
import shutil import shutil
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Dict, List, Optional from typing import Optional
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
...@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, ...@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
model_name: str, model_name: str,
prompt_logprobs: Optional[int]): prompt_logprobs: Optional[int]):
params: Dict = { params: dict = {
"prompt": ["A robot may not injure another robot", "My name is"], "prompt": ["A robot may not injure another robot", "My name is"],
"model": model_name, "model": model_name,
} }
...@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, ...@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
stream=True) stream=True)
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
chunks.append(chunk.choices[0].text) chunks.append(chunk.choices[0].text)
...@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): ...@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
max_tokens=max_tokens, max_tokens=max_tokens,
n=n, n=n,
stream=True) stream=True)
chunks: List[List[str]] = [[] for i in range(n)] chunks: list[list[str]] = [[] for i in range(n)]
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
index = chunk.choices[0].index index = chunk.choices[0].index
......
...@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer ...@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
...@@ -28,7 +28,7 @@ def server(): ...@@ -28,7 +28,7 @@ def server():
"bfloat16", "bfloat16",
"--enforce-eager", "--enforce-eager",
"--max-model-len", "--max-model-len",
"8192", "512",
"--chat-template", "--chat-template",
DUMMY_CHAT_TEMPLATE, DUMMY_CHAT_TEMPLATE,
] ]
...@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9 assert embeddings.usage.prompt_tokens == 11
assert embeddings.usage.total_tokens == 9 assert embeddings.usage.total_tokens == 11
# test using token IDs # test using token IDs
input_tokens = [1, 1, 1, 1, 1] input_tokens = [1, 1, 1, 1, 1]
...@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5 assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5 assert embeddings.usage.total_tokens == 5
...@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str] # test list[str]
input_texts = [ input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.", "The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky." "Stars twinkle brightly in the night sky."
...@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 3 assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32 assert embeddings.usage.prompt_tokens == 33
assert embeddings.usage.total_tokens == 32 assert embeddings.usage.total_tokens == 33
# test List[List[int]] # test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]] [25, 32, 64, 77]]
embedding_response = await client.embeddings.create( embedding_response = await client.embeddings.create(
...@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): ...@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 4 assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17 assert embeddings.usage.total_tokens == 17
...@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, ...@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10 assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10 assert embeddings.usage.total_tokens == 10
...@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, ...@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10 assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10 assert embeddings.usage.total_tokens == 10
......
...@@ -7,7 +7,7 @@ import pytest_asyncio ...@@ -7,7 +7,7 @@ import pytest_asyncio
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from vllm.utils import is_hip from vllm.platforms import current_platform
MODEL_NAME = os.path.join(models_path_prefix, "facebook/bart-base") MODEL_NAME = os.path.join(models_path_prefix, "facebook/bart-base")
...@@ -30,7 +30,7 @@ async def client(server): ...@@ -30,7 +30,7 @@ async def client(server):
yield async_client yield async_client
@pytest.mark.skipif(is_hip(), @pytest.mark.skipif(current_platform.is_rocm(),
reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
......
...@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [ ...@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc", "vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries", "vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits", "vllm:gpu_prefix_cache_hits",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
"vllm:iteration_tokens_total", "vllm:iteration_tokens_total",
"vllm:cache_config_info",
"vllm:request_success_total", "vllm:request_success_total",
"vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_bucket",
...@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [ ...@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [
"vllm:request_generation_tokens_sum", "vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket", "vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count", "vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count", "vllm:time_to_first_token_seconds_count",
...@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer, ...@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
def test_metrics_exist_run_batch(use_v1: bool): def test_metrics_exist_run_batch(use_v1: bool):
if use_v1: if use_v1:
pytest.skip("Skipping test on vllm V1") pytest.skip("Skipping test on vllm V1")
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
#base_url = "0.0.0.0" #base_url = "0.0.0.0"
base_url = "localhost" base_url = "localhost"
...@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool): ...@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
"-o", "-o",
output_file.name, output_file.name,
"--model", "--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"), os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
"--enable-metrics", "--enable-metrics",
"--url", "--url",
base_url, base_url,
......
...@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): ...@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
# test List[str] # test list[str]
input_texts = [ input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.", "The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky." "Stars twinkle brightly in the night sky."
...@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): ...@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
assert poolings.usage.prompt_tokens == 25 assert poolings.usage.prompt_tokens == 25
assert poolings.usage.total_tokens == 25 assert poolings.usage.total_tokens == 25
# test List[List[int]] # test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]] [25, 32, 64, 77]]
response = requests.post( response = requests.post(
......
...@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse ...@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
MODEL_NAME = "BAAI/bge-reranker-base" MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = ["--enforce-eager", "--max-model-len", "100"] args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?" query = "What is the capital of France?"
...@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): ...@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
assert rerank.results[1].relevance_score <= 0.01 assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str): def test_top_n(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?" query = "What is the capital of France?"
...@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str): ...@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
assert rerank.results[1].relevance_score <= 0.01 assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
......
...@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME ...@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag( def server_fixture(request, default_server_args): # noqa: F811
default_server_args): # noqa: F811 use_server_flag = request.param
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] if use_server_flag:
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server: args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
yield remote_server with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield (remote_server, True)
else:
with RemoteOpenAIServer(MODEL_NAME,
default_server_args) as remote_server:
yield (remote_server, False)
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion( async def test_completion_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag): server_fixture):
async with server_with_return_tokens_as_token_ids_flag.get_async_client( server, use_server_flag = server_fixture
) as client: request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True
async with server.get_async_client() as client:
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion( ...@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
echo=True, echo=True,
temperature=0, temperature=0,
max_tokens=10, max_tokens=10,
logprobs=1) logprobs=1,
extra_body=request_args)
text = completion.choices[0].text text = completion.choices[0].text
token_strs = completion.choices[0].logprobs.tokens token_strs = completion.choices[0].logprobs.tokens
...@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion( ...@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion( @pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
server_with_return_tokens_as_token_ids_flag): async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
async with server_with_return_tokens_as_token_ids_flag.get_async_client( server, use_server_flag = server_fixture
) as client: request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True
async with server.get_async_client() as client:
response = await client.chat.completions.create( response = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
# Include Unicode characters to test for dividing a single # Include Unicode characters to test for dividing a single
...@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion( ...@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
}], }],
temperature=0, temperature=0,
max_tokens=8, max_tokens=8,
logprobs=True) logprobs=True,
extra_body=request_args)
text = response.choices[0].message.content text = response.choices[0].message.content
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment