Commit 500b93c8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1

parents 99426767 38c4b7e8
import os import os
import openai # use the official client for correctness check
import pytest import pytest
from ..utils import RemoteOpenAIServer from ..utils import compare_two_settings
# downloading lora to test lora requests VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
# any model with a chat template should work here
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0)))
CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0)))
TP_SIZE = int(os.getenv("TP_SIZE", 1))
PP_SIZE = int(os.getenv("PP_SIZE", 1))
pytestmark = pytest.mark.asyncio @pytest.mark.parametrize(
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND",
[
@pytest.fixture(scope="module") (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
def server(): (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
args = [ (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
"--model", (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
MODEL_NAME, (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
DIST_BACKEND):
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
pp_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"bfloat16", "float16",
"--pipeline-parallel-size", "--pipeline-parallel-size",
str(PP_SIZE), str(PP_SIZE),
"--tensor-parallel-size", "--tensor-parallel-size",
str(TP_SIZE), str(TP_SIZE),
"--distributed-executor-backend", "--distributed-executor-backend",
"ray", DIST_BACKEND,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--tensor-parallel-size",
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
"--distributed-executor-backend",
"mp",
] ]
if CHUNKED_PREFILL: if CHUNKED_PREFILL:
args += [ pp_args.append("--enable-chunked-prefill")
"--enable-chunked-prefill", tp_args.append("--enable-chunked-prefill")
]
if EAGER_MODE: if EAGER_MODE:
args += [ pp_args.append("--enforce-eager")
"--enforce-eager", tp_args.append("--enforce-eager")
]
with RemoteOpenAIServer(args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def client(server):
return server.get_async_client()
async def test_check_models(server, client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models)
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
model_name: str):
# test simple list
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
)
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text
# test n = 2
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"
# test streaming compare_two_settings(MODEL_NAME, pp_args, tp_args)
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
assert texts[0] == texts[1]
...@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str, ...@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ @pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
("This text ends with EOS token", "</s>", 2), ("This text ends with EOS token", "</s>", 2),
]) ])
@pytest.mark.parametrize("ignore_eos", [True, False, None]) @pytest.mark.parametrize("ignore_eos", [True, False])
@pytest.mark.parametrize("include_stop_str_in_output", [True, False, None]) @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
ignore_eos: bool, include_stop_str_in_output: bool): ignore_eos: bool, include_stop_str_in_output: bool):
......
import asyncio
import os
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
from vllm.sampling_params import SamplingParams
class Mock:
    """Empty placeholder class with no base classes.

    The type-checking test in this file passes it as
    ``distributed_executor_backend`` and expects the engine to refuse it.
    """
class CustomGPUExecutor(GPUExecutor):
    """``GPUExecutor`` subclass that records every ``execute_model`` call.

    Each call creates (or truncates) an empty ``.marker`` file in the
    current working directory before delegating to the parent class.
    """

    def execute_model(self, *args, **kwargs):
        # Leave a marker on disk so a test can tell this executor was run.
        open(".marker", "w").close()
        return super().execute_model(*args, **kwargs)
class CustomGPUExecutorAsync(GPUExecutorAsync):
    """``GPUExecutorAsync`` subclass that records every async execution.

    Each ``execute_model_async`` call creates (or truncates) an empty
    ``.marker`` file in the current working directory before awaiting the
    parent implementation.
    """

    async def execute_model_async(self, *args, **kwargs):
        # Leave a marker on disk so a test can tell this executor was run.
        open(".marker", "w").close()
        return await super().execute_model_async(*args, **kwargs)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_type_checking(model):
    """Check that unsuitable executor backends are rejected.

    Three cases are exercised:

    * ``Mock`` with the synchronous engine  -> ``ValueError``
    * ``Mock`` with the asynchronous engine -> ``ValueError``
    * the synchronous ``CustomGPUExecutor`` with the asynchronous engine
      -> ``TypeError``

    The engine arguments are built inside each ``raises`` block, so an
    error raised while constructing them also satisfies the expectation.
    """
    with pytest.raises(ValueError):
        LLMEngine.from_engine_args(
            EngineArgs(model=model, distributed_executor_backend=Mock))

    with pytest.raises(ValueError):
        AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model=model, distributed_executor_backend=Mock))

    with pytest.raises(TypeError):
        AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model=model,
                            distributed_executor_backend=CustomGPUExecutor))
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir):
    """Run one engine step with ``CustomGPUExecutor`` and check its marker.

    The test switches into ``tmpdir`` so the ``.marker`` file written by
    the executor lands in a scratch directory. The previous working
    directory is always restored afterwards.
    """
    previous_dir = os.getcwd()
    os.chdir(tmpdir)
    try:
        # The scratch directory must start without a marker.
        assert not os.path.exists(".marker")

        engine = LLMEngine.from_engine_args(
            EngineArgs(model=model,
                       distributed_executor_backend=CustomGPUExecutor))
        engine.add_request("0", "foo", SamplingParams(max_tokens=1))
        engine.step()

        # The custom executor writes the marker when it executes the model.
        assert os.path.exists(".marker")
    finally:
        os.chdir(previous_dir)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir):
    """Run one async request with ``CustomGPUExecutorAsync`` and check it.

    The test switches into ``tmpdir`` so the ``.marker`` file written by
    the executor lands in a scratch directory. The previous working
    directory is always restored afterwards.
    """
    previous_dir = os.getcwd()
    os.chdir(tmpdir)
    try:
        # The scratch directory must start without a marker.
        assert not os.path.exists(".marker")

        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(
                model=model,
                distributed_executor_backend=CustomGPUExecutorAsync))
        sampling_params = SamplingParams(max_tokens=1)

        async def drain_request():
            # Submit a single request and consume its whole output stream.
            stream = await engine.add_request("0", "foo", sampling_params)
            async for _ in stream:
                pass

        asyncio.run(drain_request())

        # The custom executor writes the marker when it executes the model.
        assert os.path.exists(".marker")
    finally:
        os.chdir(previous_dir)
from http import HTTPStatus
import openai
import pytest
import requests
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, once per module.

    The server object is used as a context manager, which is exited when
    the fixture is finalized.
    """
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "bfloat16",
        "--max-model-len", "8192",
        "--enforce-eager",
        "--max-num-seqs", "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the module's ``server``."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_show_version(client: openai.AsyncOpenAI):
    """``GET /version`` must report the imported ``VLLM_VERSION``."""
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    server_root = str(client.base_url)[:-3].strip("/")

    response = requests.get(f"{server_root}/version")
    response.raise_for_status()

    expected_payload = {"version": VLLM_VERSION}
    assert response.json() == expected_payload
@pytest.mark.asyncio
async def test_check_health(client: openai.AsyncOpenAI):
    """``GET /health`` must answer with HTTP 200."""
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    server_root = str(client.base_url)[:-3].strip("/")

    response = requests.get(f"{server_root}/health")

    assert response.status_code == HTTPStatus.OK
@pytest.mark.asyncio
async def test_log_metrics(client: openai.AsyncOpenAI):
    """``GET /metrics`` must answer with HTTP 200."""
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    server_root = str(client.base_url)[:-3].strip("/")

    response = requests.get(f"{server_root}/metrics")

    assert response.status_code == HTTPStatus.OK
...@@ -7,11 +7,11 @@ import jsonschema ...@@ -7,11 +7,11 @@ import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import torch import torch
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError from openai import BadRequestError
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora" ...@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def zephyr_lora_files(): def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
return snapshot_download(repo_id=LORA_NAME) args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
@pytest.fixture(scope="module") "bfloat16",
def server(zephyr_lora_files): "--max-model-len",
with RemoteOpenAIServer([ "8192",
"--model", "--enforce-eager",
MODEL_NAME, # lora config below
# use half precision for speed and memory savings in CI environment "--enable-lora",
"--dtype", "--lora-modules",
"bfloat16", f"zephyr-lora={zephyr_lora_files}",
"--max-model-len", f"zephyr-lora2={zephyr_lora_added_tokens_files}",
"8192", "--max-lora-rank",
"--enforce-eager", "64",
# lora config below "--max-cpu-loras",
"--enable-lora", "2",
"--lora-modules", "--max-num-seqs",
f"zephyr-lora={zephyr_lora_files}", "128",
f"zephyr-lora2={zephyr_lora_files}", ]
"--max-lora-rank",
"64", with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128",
]) as remote_server:
yield remote_server yield remote_server
......
# imports for guided decoding tests # imports for guided decoding tests
import json import json
import re import re
import shutil
from tempfile import TemporaryDirectory
from typing import List from typing import List
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import requests
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from openai import BadRequestError from openai import BadRequestError
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
...@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer ...@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically these adapters use a different base model,
# generation quality here # but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora" LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -28,28 +34,58 @@ def zephyr_lora_files(): ...@@ -28,28 +34,58 @@ def zephyr_lora_files():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): def zephyr_lora_added_tokens_files(zephyr_lora_files):
with RemoteOpenAIServer([ tmp_dir = TemporaryDirectory()
"--model", tmp_model_dir = f"{tmp_dir.name}/zephyr"
MODEL_NAME, shutil.copytree(zephyr_lora_files, tmp_model_dir)
# use half precision for speed and memory savings in CI environment tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
"--dtype", # Copy tokenizer to adapter and add some unique tokens
"bfloat16", # 32000, 32001, 32002
"--max-model-len", added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
"8192", special_tokens=True)
"--enforce-eager", assert added == 3
# lora config below tokenizer.save_pretrained(tmp_model_dir)
"--enable-lora", yield tmp_model_dir
"--lora-modules", tmp_dir.cleanup()
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank", @pytest.fixture(scope="module")
"64", def zephyr_pa_files():
"--max-cpu-loras", return snapshot_download(repo_id=PA_NAME)
"2",
"--max-num-seqs",
"128", @pytest.fixture(scope="module")
]) as remote_server: def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
# pa config
"--enable-prompt-adapter",
"--prompt-adapters",
f"zephyr-pa={zephyr_pa_files}",
f"zephyr-pa2={zephyr_pa_files}",
"--max-prompt-adapters",
"2",
"--max-prompt-adapter-token",
"128",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
...@@ -60,11 +96,14 @@ def client(server): ...@@ -60,11 +96,14 @@ def client(server):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # first test base model, then test loras, then test prompt adapters
"model_name", "model_name,num_virtual_tokens",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
) )
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
num_virtual_tokens: int):
completion = await client.completions.create(model=model_name, completion = await client.completions.create(model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=5, max_tokens=5,
...@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): ...@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
assert len(choice.text) >= 5 assert len(choice.text) >= 5
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage( assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11) completion_tokens=5,
prompt_tokens=6 + num_virtual_tokens,
total_tokens=11 + num_virtual_tokens)
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
) )
assert len(completion.choices[0].text) >= 5 assert len(completion.choices[0].text) >= 1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
# test using token IDs
completion = await client.completions.create(
model="zephyr-lora2",
prompt=[0, 0, 32000, 32001, 32002],
echo=True,
max_tokens=5,
temperature=0.0,
)
# Added tokens should appear in tokenized prompt
assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 32000, 32001, 32002],
echo=True,
max_tokens=5,
temperature=0.0,
)
# Added tokens should not appear in tokenized prompt
assert "vllm" not in completion.choices[0].text
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # first test base model, then test loras, then test prompt adapters
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
) )
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
...@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): ...@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora and 1 pa hereafter
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
...@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): ...@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs # test using token IDs
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
...@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): ...@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
...@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, ...@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with pytest.raises( with pytest.raises(
(openai.BadRequestError, openai.APIError)): # test using token IDs (openai.BadRequestError, openai.APIError)): # test using token IDs
await client.completions.create( await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
...@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, ...@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with pytest.raises( with pytest.raises(
(openai.BadRequestError, openai.APIError)): # test using token IDs (openai.BadRequestError, openai.APIError)): # test using token IDs
stream = await client.completions.create( stream = await client.completions.create(
model=MODEL_NAME, model=model_name,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
...@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, ...@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_completion_streaming(client: openai.AsyncOpenAI, async def test_completion_streaming(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
...@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, ...@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", "model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_completion_stream_options(client: openai.AsyncOpenAI, async def test_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
...@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
) )
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
# test both text and token IDs # test both text and token IDs
...@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, ...@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
prompt="Give an example string that fits this regex", prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=sample_regex, extra_body=dict(guided_regex=sample_regex,
guided_json=sample_json_schema)) guided_json=sample_json_schema))
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
base_url = str(client.base_url)[:-3].strip("/")
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
for add_special in [False, True]:
prompt = "This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize",
json={
"add_special_tokens": add_special,
"model": model_name,
"prompt": prompt
})
response.raise_for_status()
assert response.json() == {
"tokens": tokens,
"count": len(tokens),
"max_model_len": 8192
}
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
base_url = str(client.base_url)[:-3]
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
prompt = "This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=False)
response = requests.post(base_url + "detokenize",
json={
"model": model_name,
"tokens": tokens
})
response.raise_for_status()
assert response.json() == {"prompt": prompt}
...@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" ...@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def embedding_server(): def embedding_server():
with RemoteOpenAIServer([ args = [
"--model", # use half precision for speed and memory savings in CI environment
EMBEDDING_MODEL_NAME, "--dtype",
# use half precision for speed and memory savings in CI environment "bfloat16",
"--dtype", "--enforce-eager",
"bfloat16", "--max-model-len",
"--enforce-eager", "8192",
"--max-model-len", "--enforce-eager",
"8192", ]
"--enforce-eager",
]) as remote_server: with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -19,27 +19,27 @@ def zephyr_lora_files(): ...@@ -19,27 +19,27 @@ def zephyr_lora_files():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): def server(zephyr_lora_files):
with RemoteOpenAIServer([ args = [
"--model", # use half precision for speed and memory savings in CI environment
MODEL_NAME, "--dtype",
# use half precision for speed and memory savings in CI environment "bfloat16",
"--dtype", "--max-model-len",
"bfloat16", "8192",
"--max-model-len", "--enforce-eager",
"8192", # lora config below
"--enforce-eager", "--enable-lora",
# lora config below "--lora-modules",
"--enable-lora", f"zephyr-lora={zephyr_lora_files}",
"--lora-modules", f"zephyr-lora2={zephyr_lora_files}",
f"zephyr-lora={zephyr_lora_files}", "--max-lora-rank",
f"zephyr-lora2={zephyr_lora_files}", "64",
"--max-lora-rank", "--max-cpu-loras",
"64", "2",
"--max-cpu-loras", "--max-num-seqs",
"2", "128",
"--max-num-seqs", ]
"128",
]) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -32,11 +32,13 @@ async def _async_serving_chat_init(): ...@@ -32,11 +32,13 @@ async def _async_serving_chat_init():
model_config, model_config,
served_model_names=[MODEL_NAME], served_model_names=[MODEL_NAME],
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE) chat_template=CHAT_TEMPLATE,
lora_modules=None,
prompt_adapters=None,
request_logger=None)
return serving_completion return serving_completion
def test_async_serving_chat_init(): def test_async_serving_chat_init():
serving_completion = asyncio.run(_async_serving_chat_init()) serving_completion = asyncio.run(_async_serving_chat_init())
assert serving_completion.tokenizer is not None assert serving_completion.chat_template == CHAT_TEMPLATE
assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
import openai # use the official client for correctness check
import pytest
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` with one LoRA.

    The adapter at ``zephyr_lora_added_tokens_files`` is registered under
    the name ``zephyr-lora2``. The server object is used as a context
    manager, which is exited when the fixture is finalized.
    """
    base_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "bfloat16",
        "--max-model-len", "8192",
        "--enforce-eager",
        "--max-num-seqs", "128",
    ]
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank", "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME,
                            base_args + lora_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
    """Pick the tokenizer source that matches ``model_name``.

    For ``"zephyr-lora2"`` this is the ``zephyr_lora_added_tokens_files``
    value; for any other model the model name itself is returned.
    """
    if model_name == "zephyr-lora2":
        return zephyr_lora_added_tokens_files
    return model_name
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the module's ``server``."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
                                    model_name: str, tokenizer_name: str):
    """Compare ``POST /tokenize`` for a plain prompt with a local tokenizer.

    The request is sent with ``add_special_tokens`` both off and on. The
    response must carry the same token ids as the local tokenizer, their
    count, and a ``max_model_len`` of 8192.
    """
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    server_root = str(client.base_url)[:-3].strip("/")
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")
    prompt = "vllm1 This is a test prompt."

    for add_special in (False, True):
        expected_tokens = reference_tokenizer.encode(
            prompt, add_special_tokens=add_special)

        payload = {
            "add_special_tokens": add_special,
            "model": model_name,
            "prompt": prompt,
        }
        response = requests.post(f"{server_root}/tokenize", json=payload)
        response.raise_for_status()

        assert response.json() == {
            "tokens": expected_tokens,
            "count": len(expected_tokens),
            "max_model_len": 8192,
        }
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                             tokenizer_name: str):
    """Compare ``POST /tokenize`` for chat messages with a local tokenizer.

    Every combination of ``add_generation_prompt`` and
    ``add_special_tokens`` is tried. The expected ids come from rendering
    the conversation with the local tokenizer's chat template and then
    encoding the rendered text.
    """
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    server_root = str(client.base_url)[:-3].strip("/")
    reference_tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                                        tokenizer_mode="fast")

    # The same three-turn conversation is reused for every combination.
    conversation = [
        {"role": "user", "content": "Hi there!"},
        {"role": "assistant", "content": "Nice to meet you!"},
        {"role": "user", "content": "Can I ask a question? vllm1"},
    ]

    for add_generation in (False, True):
        for add_special in (False, True):
            rendered_prompt = reference_tokenizer.apply_chat_template(
                add_generation_prompt=add_generation,
                conversation=conversation,
                tokenize=False)
            expected_tokens = reference_tokenizer.encode(
                rendered_prompt, add_special_tokens=add_special)

            payload = {
                "add_generation_prompt": add_generation,
                "add_special_tokens": add_special,
                "messages": conversation,
                "model": model_name,
            }
            response = requests.post(f"{server_root}/tokenize",
                                     json=payload)
            response.raise_for_status()

            assert response.json() == {
                "tokens": expected_tokens,
                "count": len(expected_tokens),
                "max_model_len": 8192,
            }
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    """Check that ``POST /detokenize`` restores the original prompt text.

    The prompt is encoded locally without special tokens. The resulting
    ids are sent to the server, which must answer with exactly
    ``{"prompt": <original text>}``.
    """
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" API suffix -- confirm) plus any surrounding slashes.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    # The leftover debug print that preceded this request was removed; a
    # failing assertion already identifies the parametrized case.
    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}
...@@ -2,9 +2,8 @@ from typing import Dict, List ...@@ -2,9 +2,8 @@ from typing import Dict, List
import openai import openai
import pytest import pytest
import pytest_asyncio
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import VLLM_PATH, RemoteOpenAIServer from ...utils import VLLM_PATH, RemoteOpenAIServer
...@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [ ...@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
with RemoteOpenAIServer([ args = [
"--model", "--dtype",
MODEL_NAME, "bfloat16",
"--dtype", "--max-model-len",
"bfloat16", "4096",
"--max-model-len", "--enforce-eager",
"4096", "--chat-template",
"--enforce-eager", str(LLAVA_CHAT_TEMPLATE),
"--chat-template", ]
str(LLAVA_CHAT_TEMPLATE),
]) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
...@@ -42,11 +41,10 @@ def client(server): ...@@ -42,11 +41,10 @@ def client(server):
return server.get_async_client() return server.get_async_client()
@pytest_asyncio.fixture(scope="session") @pytest.fixture(scope="session")
async def base64_encoded_image() -> Dict[str, str]: def base64_encoded_image() -> Dict[str, str]:
return { return {
image_url: image_url: encode_image_base64(fetch_image(image_url))
encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
for image_url in TEST_IMAGE_URLS for image_url in TEST_IMAGE_URLS
} }
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment