Commit b9e12416 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.3

parents e5d707db e9d3aa04
...@@ -11,12 +11,13 @@ import torch ...@@ -11,12 +11,13 @@ import torch
from vllm.distributed import (broadcast_tensor_dict, from vllm.distributed import (broadcast_tensor_dict,
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce) tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel) from ..utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int, def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
...@@ -24,12 +25,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int, ...@@ -24,12 +25,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"] del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port) distributed_init_port)
num_elements = 8 num_elements = 8
all_tensors = [ all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="cuda") * torch.arange(num_elements, dtype=torch.float32, device="cuda") *
(r + 1) for r in range(tensor_parallel_size) (r + 1) for r in range(tp_size)
] ]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[rank] t = all_tensors[rank]
...@@ -38,7 +39,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int, ...@@ -38,7 +39,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int, def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
...@@ -46,7 +47,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int, ...@@ -46,7 +47,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"] del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port) distributed_init_port)
num_dimensions = 3 num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2)) tensor_size = list(range(2, num_dimensions + 2))
...@@ -57,7 +58,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int, ...@@ -57,7 +58,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
all_tensors = [ all_tensors = [
torch.arange(total_size, dtype=torch.float32, torch.arange(total_size, dtype=torch.float32,
device="cuda").reshape(tensor_size) * (r + 1) device="cuda").reshape(tensor_size) * (r + 1)
for r in range(tensor_parallel_size) for r in range(tp_size)
] ]
expected = torch.cat(all_tensors, dim=all_gather_dimension) expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[rank] t = all_tensors[rank]
...@@ -66,7 +67,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int, ...@@ -66,7 +67,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str): distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs # so that each worker can see all the GPUs
...@@ -74,17 +75,21 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, ...@@ -74,17 +75,21 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"] del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port) distributed_init_port)
test_dict = { test_dict = {
# device tensor
"a": torch.arange(8, dtype=torch.float32, device="cuda"), "a": torch.arange(8, dtype=torch.float32, device="cuda"),
"b": torch.arange(16, dtype=torch.int8, device="cuda"), # CPU tensor
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
"c": "test", "c": "test",
"d": [1, 2, 3], "d": [1, 2, 3],
"e": { "e": {
"a": 1, "a": 1,
"b": 2 "b": 2
}, },
# empty tensor
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
} }
if rank == 0: if rank == 0:
...@@ -97,14 +102,15 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, ...@@ -97,14 +102,15 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
assert recv_dict["c"] == test_dict["c"] assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"] assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"] assert recv_dict["e"] == test_dict["e"]
assert torch.allclose(recv_dict["f"], test_dict["f"])
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.") reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [ @pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker, all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker broadcast_tensor_dict_test_worker
]) ])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): def test_multi_process_tensor_parallel(tp_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target) multi_process_tensor_parallel(tp_size, 1, test_target)
...@@ -6,10 +6,13 @@ import ray ...@@ -6,10 +6,13 @@ import ray
import torch import torch
import torch.distributed as dist import torch.distributed as dist
from vllm.distributed import tensor_model_parallel_all_reduce from vllm.distributed.communication_op import ( # noqa
from vllm.distributed.device_communicators import custom_all_reduce graph_capture, tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment, from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
multi_process_tensor_parallel) get_tp_ca_communicator)
from ..utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
random.seed(42) random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
...@@ -18,17 +21,36 @@ for i, v in enumerate(test_sizes): ...@@ -18,17 +21,36 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(world_size, rank, distributed_init_port): def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"] del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port) distributed_init_port)
custom_all_reduce.init_custom_all_reduce() group = get_tensor_model_parallel_group()
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes: for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]: for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with custom_all_reduce.capture(): with graph_capture() as graph_capture_context:
# use integers so result matches NCCL exactly # use integers so result matches NCCL exactly
inp1 = torch.randint(1, inp1 = torch.randint(1,
16, (sz, ), 16, (sz, ),
...@@ -40,45 +62,54 @@ def graph_allreduce(world_size, rank, distributed_init_port): ...@@ -40,45 +62,54 @@ def graph_allreduce(world_size, rank, distributed_init_port):
device=torch.cuda.current_device()) device=torch.cuda.current_device())
torch.cuda.synchronize() torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph): with torch.cuda.graph(graph,
out1 = tensor_model_parallel_all_reduce(inp1) stream=graph_capture_context.stream):
# the input buffer is immediately modified to test for i in range(num_communication):
# synchronization out1 = tensor_model_parallel_all_reduce(inp1)
dist.all_reduce(inp1) # the input buffer is immediately modified to test
out2 = tensor_model_parallel_all_reduce(inp2) # synchronization
dist.all_reduce(inp2) dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay() graph.replay()
assert torch.allclose(out1, inp1) assert torch.allclose(out1, inp1)
assert torch.allclose(out2, inp2) assert torch.allclose(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1) @ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(world_size, rank, distributed_init_port): def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"] del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank, init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port) distributed_init_port)
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024 sz = 1024
custom_all_reduce.init_custom_all_reduce() fa = get_tp_ca_communicator()
fa = custom_all_reduce.get_handle()
inp = torch.ones(sz, dtype=torch.float32, device=device) inp = torch.ones(sz, dtype=torch.float32, device=device)
out = fa.all_reduce_unreg(inp) out = inp
assert torch.allclose(out, inp * world_size) for _ in range(num_communication):
out = fa.all_reduce_unreg(out)
assert torch.allclose(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = fa.all_reduce_unreg(inp) out = inp
assert torch.allclose(out, inp * world_size) for _ in range(num_communication):
out = fa.all_reduce_unreg(out)
assert torch.allclose(out, inp * (tp_size**num_communication))
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.parametrize("tp_size", [2])
reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target) world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
if __name__ == "__main__": multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target)
multi_process_tensor_parallel(2, graph_allreduce)
import multiprocessing import multiprocessing
import os
import pytest import pytest
import torch import torch
import torch.distributed
import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.communication_op import ( # noqa
from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator, graph_capture, tensor_model_parallel_all_reduce)
ncclGetUniqueId) from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.parallel_state import ( from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group, from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment, with_pynccl_for_all_reduce) init_distributed_environment)
from vllm.utils import update_environment_variables from vllm.utils import update_environment_variables
...@@ -41,6 +42,9 @@ def worker_fn_wrapper(fn): ...@@ -41,6 +42,9 @@ def worker_fn_wrapper(fn):
# and update the environment variables in the function # and update the environment variables in the function
def wrapped_fn(env): def wrapped_fn(env):
update_environment_variables(env) update_environment_variables(env)
local_rank = os.environ['LOCAL_RANK']
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment() init_distributed_environment()
fn() fn()
...@@ -49,11 +53,13 @@ def worker_fn_wrapper(fn): ...@@ -49,11 +53,13 @@ def worker_fn_wrapper(fn):
@worker_fn_wrapper @worker_fn_wrapper
def worker_fn(): def worker_fn():
comm = NCCLCommunicator() pynccl_comm = PyNcclCommunicator()
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank) tensor = torch.ones(16, 1024, 1024,
comm.all_reduce(tensor) dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
pynccl_comm.all_reduce(tensor)
result = tensor.mean().cpu().item() result = tensor.mean().cpu().item()
assert result == comm.world_size assert result == pynccl_comm.world_size
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
...@@ -63,44 +69,42 @@ def test_pynccl(): ...@@ -63,44 +69,42 @@ def test_pynccl():
@worker_fn_wrapper @worker_fn_wrapper
def multiple_tp_worker_fn(): def multiple_allreduce_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}") device = torch.device(f"cuda:{torch.distributed.get_rank()}")
groups = [ groups = [
torch.distributed.new_group(ranks=[0, 1], backend="gloo"), torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
torch.distributed.new_group(ranks=[2, 3], backend="gloo") torch.distributed.new_group(ranks=[2, 3], backend="gloo")
] ]
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
comm = NCCLCommunicator(group=group, device=device) pynccl_comm = PyNcclCommunicator(group=group, device=device)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
# two groups can communicate independently with pynccl_comm.change_state(enable=True):
if torch.distributed.get_rank() in [0, 1]: # two groups can communicate independently
comm.all_reduce(tensor) if torch.distributed.get_rank() in [0, 1]:
comm.all_reduce(tensor) pynccl_comm.all_reduce(tensor)
result = tensor.mean().cpu().item() pynccl_comm.all_reduce(tensor)
assert result == 4 result = tensor.mean().cpu().item()
else: assert result == 4
comm.all_reduce(tensor) else:
result = tensor.mean().cpu().item() pynccl_comm.all_reduce(tensor)
assert result == 2 result = tensor.mean().cpu().item()
assert result == 2
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.") reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp(): def test_pynccl_multiple_allreduce():
# this tests pynccl for multiple tp groups, in a standalone way # this tests pynccl for multiple tp groups, in a standalone way
# i.e. call `comm.all_reduce` directly # i.e. call `pynccl_comm.all_reduce` directly
distributed_run(multiple_tp_worker_fn, 4) distributed_run(multiple_allreduce_worker_fn, 4)
@worker_fn_wrapper @worker_fn_wrapper
def multiple_tp_with_vllm_worker_fn(): def multiple_allreduce_with_vllm_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}") device = torch.device(f"cuda:{torch.distributed.get_rank()}")
torch.cuda.set_device(torch.distributed.get_rank())
ensure_model_parallel_initialized(2, 2) ensure_model_parallel_initialized(2, 2)
pynccl_utils.init_process_group(
group=get_tensor_model_parallel_cpu_group())
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with with_pynccl_for_all_reduce(): with graph_capture():
# two tp groups can communicate independently # two tp groups can communicate independently
if torch.distributed.get_rank() in [0, 1]: if torch.distributed.get_rank() in [0, 1]:
tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor)
...@@ -115,29 +119,31 @@ def multiple_tp_with_vllm_worker_fn(): ...@@ -115,29 +119,31 @@ def multiple_tp_with_vllm_worker_fn():
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.") reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_tp_with_vllm(): def test_pynccl_multiple_allreduce_with_vllm():
# this tests pynccl for multiple tp groups, together with vllm # this tests pynccl for multiple tp groups, together with vllm
# i.e. call `tensor_model_parallel_all_reduce` # i.e. call `tensor_model_parallel_all_reduce`
distributed_run(multiple_tp_with_vllm_worker_fn, 4) distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
@worker_fn_wrapper @worker_fn_wrapper
def worker_fn_with_cudagraph(): def worker_fn_with_cudagraph():
with torch.no_grad(): with torch.no_grad():
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
comm = NCCLCommunicator() pynccl_comm = PyNcclCommunicator()
# run something in the default stream to initialize torch engine # run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{comm.rank}') a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
torch.cuda.synchronize() torch.cuda.synchronize()
with torch.cuda.graph(graph, stream=comm.stream): with torch.cuda.graph(
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
enable=True):
# operation during the graph capture is recorded but not executed # operation during the graph capture is recorded but not executed
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
comm.all_reduce(a) pynccl_comm.all_reduce(a)
comm.stream.synchronize() pynccl_comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**0 assert a.mean().cpu().item() == pynccl_comm.world_size**0
graph.replay() graph.replay()
comm.stream.synchronize() pynccl_comm.stream.synchronize()
assert a.mean().cpu().item() == comm.world_size**1 assert a.mean().cpu().item() == pynccl_comm.world_size**1
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
...@@ -146,8 +152,71 @@ def test_pynccl_with_cudagraph(): ...@@ -146,8 +152,71 @@ def test_pynccl_with_cudagraph():
distributed_run(worker_fn_with_cudagraph, 2) distributed_run(worker_fn_with_cudagraph, 2)
@worker_fn_wrapper
def send_recv_worker_fn():
    """Exchange one tensor between ranks via PyNccl send/recv.

    Rank 0 builds a buffer of ones and sends it; any other rank allocates
    an uninitialized buffer of the same shape and receives into it. After
    the transfer the mean of the local buffer is asserted to be 1.
    """
    pynccl_comm = PyNcclCommunicator()
    is_sender = pynccl_comm.rank == 0

    # Only the sender needs defined contents: the receiver's buffer is
    # expected to be overwritten by recv().
    make_buffer = torch.ones if is_sender else torch.empty
    tensor = make_buffer(16, 1024, 1024,
                         dtype=torch.float32).cuda(pynccl_comm.rank)

    with pynccl_comm.change_state(enable=True):
        transfer = pynccl_comm.send if is_sender else pynccl_comm.recv
        transfer(tensor)

    assert tensor.mean().cpu().item() == 1
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl_send_recv():
    """Launch `send_recv_worker_fn` through `distributed_run` with size 2.

    Skipped when fewer than two CUDA devices are reported.
    """
    # presumably the number of worker processes / ranks to spawn
    num_workers = 2
    distributed_run(send_recv_worker_fn, num_workers)
@worker_fn_wrapper
def multiple_send_recv_worker_fn():
    """Run two independent PyNccl send/recv pairs: ranks (0, 2) and (1, 3).

    Rank 0 sends a tensor of ones and rank 1 sends a tensor of twos; ranks
    2 and 3 receive into uninitialized buffers. Afterwards the mean of the
    local tensor must be 1 on ranks 0 and 2, and 2 on ranks 1 and 3.
    """
    rank = torch.distributed.get_rank()
    device = torch.device(f"cuda:{rank}")

    # Both groups are created on every rank, in the same order, before the
    # rank picks the one it belongs to.
    even_group = torch.distributed.new_group(ranks=[0, 2], backend="gloo")
    odd_group = torch.distributed.new_group(ranks=[1, 3], backend="gloo")
    in_even_group = rank in [0, 2]
    group = even_group if in_even_group else odd_group
    pynccl_comm = PyNcclCommunicator(group=group, device=device)

    if rank == 0:
        tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
    elif rank == 1:
        tensor = 2 * torch.ones(
            16, 1024, 1024, dtype=torch.float32, device=device)
    else:
        # Receivers: contents are expected to be overwritten by recv().
        tensor = torch.empty(16,
                             1024,
                             1024,
                             dtype=torch.float32,
                             device=device)

    is_sender = rank in [0, 1]
    with pynccl_comm.change_state(enable=True):
        if is_sender:
            pynccl_comm.send(tensor)
        else:
            pynccl_comm.recv(tensor)

    result = tensor.mean().cpu().item()
    if in_even_group:
        assert result == 1
    else:
        assert result == 2
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
def test_pynccl_multiple_send_recv():
    """Launch `multiple_send_recv_worker_fn` through `distributed_run` with size 4.

    Skipped when fewer than four CUDA devices are reported.
    """
    # presumably the number of worker processes / ranks to spawn
    num_workers = 4
    distributed_run(multiple_send_recv_worker_fn, num_workers)
def test_ncclGetUniqueId(): def test_ncclGetUniqueId():
unique_id = ncclGetUniqueId() lib = NCCLLibrary()
unique_id = lib.ncclGetUniqueId()
# `list(unique_id.internal)` is something like this: # `list(unique_id.internal)` is something like this:
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0, # [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
......
import multiprocessing
import tempfile
def target_fn(env, filepath):
    """Subprocess entry point: apply `env`, then check the NCCL library.

    Args:
        env: mapping passed to `update_environment_variables`.
        filepath: path handed to `nccl_integrity_check`.
    """
    # vllm is imported inside the function, so the import happens in the
    # process that actually executes this target.
    import vllm.utils as vllm_utils

    vllm_utils.update_environment_variables(env)
    vllm_utils.nccl_integrity_check(filepath)
def test_library_file():
    """Check NCCL library integrity detection and the VLLM_NCCL_SO_PATH override.

    The test runs two subprocess checks:

    1. The library file found by `find_nccl_library` is truncated to half
       its size; the child process running `target_fn` must exit non-zero.
    2. An intact copy of the library is written to a temporary file and
       passed both as `VLLM_NCCL_SO_PATH` and as the path to check; the
       child process must exit with code 0.

    The original library contents are always written back, and the
    temporary copy is always removed, even if an assertion fails.
    """
    # note: don't import vllm.distributed.device_communicators.pynccl
    # before running this test, otherwise the library file will be loaded
    # and it might interfere with the test
    import os

    from vllm.utils import find_nccl_library

    so_file = find_nccl_library()
    with open(so_file, 'rb') as f:
        content = f.read()

    # Stays None until the temporary copy exists, so cleanup knows whether
    # there is anything to delete.
    tmp_path = None
    try:
        # corrupt the library file, should raise an exception
        with open(so_file, 'wb') as f:
            f.write(content[:len(content) // 2])
        p = multiprocessing.Process(target=target_fn, args=({}, so_file))
        p.start()
        p.join()
        assert p.exitcode != 0

        # move the library file to a tmp path
        # test VLLM_NCCL_SO_PATH
        fd, tmp_path = tempfile.mkstemp()
        # mkstemp() hands back an already-open descriptor; write through it
        # so it is closed instead of leaked.
        with os.fdopen(fd, 'wb') as f:
            f.write(content)
        p = multiprocessing.Process(target=target_fn,
                                    args=({
                                        "VLLM_NCCL_SO_PATH": tmp_path
                                    }, tmp_path))
        p.start()
        p.join()
        assert p.exitcode == 0
    finally:
        # Restore the real library first: that is the cleanup step that
        # matters most to anything run after this test.
        with open(so_file, 'wb') as f:
            f.write(content)
        if tmp_path is not None:
            os.remove(tmp_path)
...@@ -4,16 +4,17 @@ from unittest.mock import MagicMock ...@@ -4,16 +4,17 @@ from unittest.mock import MagicMock
import pytest import pytest
from transformers import PreTrainedTokenizer from transformers import PreTrainedTokenizer
from tests.core.utils import create_seq_group
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
from vllm.engine.output_processor.stop_checker import StopChecker from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
SequenceStatus) SequenceOutput, SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter from vllm.utils import Counter
from ...core.utils import create_seq_group
@pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("seq_output_len", [128])
@pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.parametrize("num_new_tokens", [1, 12])
...@@ -51,7 +52,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): ...@@ -51,7 +52,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
new_token_ids = list(range(num_new_tokens)) new_token_ids = list(range(num_new_tokens))
outputs = [ outputs = [
SequenceGroupOutput( CompletionSequenceGroupOutput(
samples=[ samples=[
SequenceOutput( SequenceOutput(
parent_seq_id=seq.seq_id, parent_seq_id=seq.seq_id,
...@@ -103,7 +104,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, ...@@ -103,7 +104,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
new_token_ids = list(range(num_new_tokens)) new_token_ids = list(range(num_new_tokens))
outputs = [ outputs = [
SequenceGroupOutput( CompletionSequenceGroupOutput(
samples=[ samples=[
SequenceOutput( SequenceOutput(
parent_seq_id=seq.seq_id, parent_seq_id=seq.seq_id,
...@@ -170,7 +171,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, ...@@ -170,7 +171,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
new_token_ids[eos_index] = eos_token_id new_token_ids[eos_index] = eos_token_id
outputs = [ outputs = [
SequenceGroupOutput( CompletionSequenceGroupOutput(
samples=[ samples=[
SequenceOutput( SequenceOutput(
parent_seq_id=seq.seq_id, parent_seq_id=seq.seq_id,
...@@ -239,7 +240,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, ...@@ -239,7 +240,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
new_token_ids[eos_index] = eos_token_id new_token_ids[eos_index] = eos_token_id
outputs = [ outputs = [
SequenceGroupOutput( CompletionSequenceGroupOutput(
samples=[ samples=[
SequenceOutput( SequenceOutput(
parent_seq_id=seq.seq_id, parent_seq_id=seq.seq_id,
......
from unittest.mock import MagicMock
import pytest
from transformers import PreTrainedTokenizer
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, Sequence, SequenceStatus
def sequence_with_eos(text: str, eos_token: str,
                      eos_token_id: int) -> Sequence:
    """
    Build a RUNNING Sequence whose output text and token ids end with EOS.

    One placeholder token id is appended per character of `text`, counting
    up from `eos_token_id + 1` so that none of them equals the EOS id; the
    EOS id itself is appended last.
    """
    seq = Sequence(
        seq_id=0,
        inputs={"prompt_token_ids": []},
        block_size=16,
        eos_token_id=eos_token_id,
    )
    seq.output_text = text + eos_token

    first_id = eos_token_id + 1
    token_ids = [*range(first_id, first_id + len(text)), eos_token_id]
    for token_id in token_ids:
        seq.append_token_id(token_id=token_id,
                            logprobs={token_id: Logprob(0.0)})

    seq.status = SequenceStatus.RUNNING
    return seq
@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
    ("This text ends with EOS token", "</s>", 2),
])
@pytest.mark.parametrize("ignore_eos", [True, False, None])
@pytest.mark.parametrize("include_stop_str_in_output", [True, False, None])
@pytest.mark.skip_global_cleanup
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
                           ignore_eos: bool, include_stop_str_in_output: bool):
    """
    Check what StopChecker.maybe_stop_sequence does with a trailing EOS token.

    Expected outcomes, by parameter combination:
    - `ignore_eos` truthy: the sequence keeps RUNNING and the text keeps EOS
    - otherwise, `include_stop_str_in_output` truthy: the sequence is
      FINISHED_STOPPED and the text keeps EOS
    - otherwise: the sequence is FINISHED_STOPPED and EOS is removed
      from the text
    """
    tokenizer = MagicMock(spec=PreTrainedTokenizer)
    stop_checker = StopChecker(
        max_model_len=1024,
        get_tokenizer_for_seq=MagicMock(return_value=tokenizer))

    seq = sequence_with_eos(
        text=text_wo_eos,
        eos_token=eos_token,
        eos_token_id=eos_token_id,
    )

    # Note that `stop` and `stop_token_ids` are not specified
    sampling_params = SamplingParams(
        min_tokens=1,
        ignore_eos=ignore_eos,
        include_stop_str_in_output=include_stop_str_in_output)

    stop_checker.maybe_stop_sequence(
        seq=seq,
        # the EOS text is the only newly added output
        new_char_count=len(eos_token),
        sampling_params=sampling_params,
    )

    text_with_eos = text_wo_eos + eos_token
    if ignore_eos:
        expected_status = SequenceStatus.RUNNING
        expected_text = text_with_eos
    else:
        expected_status = SequenceStatus.FINISHED_STOPPED
        expected_text = (text_with_eos
                         if include_stop_str_in_output else text_wo_eos)

    assert seq.status == expected_status
    assert seq.output_text == expected_text
...@@ -14,7 +14,7 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -14,7 +14,7 @@ def test_skip_tokenizer_initialization(model: str):
with pytest.raises(ValueError) as err: with pytest.raises(ValueError) as err:
llm.generate("abc", sampling_params) llm.generate("abc", sampling_params)
assert "prompts must be None if" in str(err.value) assert "prompts must be None if" in str(err.value)
outputs = llm.generate(prompt_token_ids=[[1, 2, 3]], outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params) sampling_params=sampling_params)
assert len(outputs) > 0 assert len(outputs) > 0
completions = outputs[0].outputs completions = outputs[0].outputs
......
...@@ -32,6 +32,7 @@ def test_stop_reason(vllm_model, example_prompts): ...@@ -32,6 +32,7 @@ def test_stop_reason(vllm_model, example_prompts):
# test stop token # test stop token
outputs = llm.generate(example_prompts, outputs = llm.generate(example_prompts,
sampling_params=SamplingParams( sampling_params=SamplingParams(
ignore_eos=True,
seed=SEED, seed=SEED,
max_tokens=MAX_TOKENS, max_tokens=MAX_TOKENS,
stop_token_ids=[stop_token_id])) stop_token_ids=[stop_token_id]))
...@@ -43,7 +44,10 @@ def test_stop_reason(vllm_model, example_prompts): ...@@ -43,7 +44,10 @@ def test_stop_reason(vllm_model, example_prompts):
# test stop string # test stop string
outputs = llm.generate(example_prompts, outputs = llm.generate(example_prompts,
sampling_params=SamplingParams( sampling_params=SamplingParams(
seed=SEED, max_tokens=MAX_TOKENS, stop=".")) ignore_eos=True,
seed=SEED,
max_tokens=MAX_TOKENS,
stop="."))
for output in outputs: for output in outputs:
output = output.outputs[0] output = output.outputs[0]
assert output.finish_reason == "stop" assert output.finish_reason == "stop"
......
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
import pytest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
pytestmark = pytest.mark.openai
@dataclass @dataclass
class MockModelConfig: class MockModelConfig:
...@@ -14,17 +18,22 @@ class MockModelConfig: ...@@ -14,17 +18,22 @@ class MockModelConfig:
tokenizer_mode = "auto" tokenizer_mode = "auto"
max_model_len = 100 max_model_len = 100
tokenizer_revision = None tokenizer_revision = None
embedding_mode = False
@dataclass @dataclass
class MockEngine: class MockEngine:
async def get_model_config(self): async def get_model_config(self):
return MockModelConfig return MockModelConfig()
async def _async_serving_chat_init(): async def _async_serving_chat_init():
serving_completion = OpenAIServingChat(MockEngine(), engine = MockEngine()
model_config = await engine.get_model_config()
serving_completion = OpenAIServingChat(engine,
model_config,
served_model_names=[MODEL_NAME], served_model_names=[MODEL_NAME],
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE) chat_template=CHAT_TEMPLATE)
......
...@@ -52,6 +52,8 @@ TEST_SCHEMA = { ...@@ -52,6 +52,8 @@ TEST_SCHEMA = {
TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
pytestmark = pytest.mark.openai
def test_guided_logits_processors(): def test_guided_logits_processors():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
......
import weakref
from typing import List
import pytest
from vllm import LLM, EmbeddingRequestOutput, PoolingParams
from ..conftest import cleanup
# Model name passed to ``LLM(model=...)`` by the ``llm`` fixture in this module.
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"

# Text prompts shared by the string-based tests below.
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Pre-tokenized prompts (one list of token ids per prompt) shared by the
# token-based tests below.
TOKEN_IDS = [
    # Using ID={0, 1, 2, 3} results in NaN values,
    # so we add this offset of 1000
    [1000],
    [1000, 1001],
    [1000, 1002, 1001],
    [1000, 1003, 1001, 1002],
]

# Module-level pytest mark: every test in this module carries the ``llm`` mark.
pytestmark = pytest.mark.llm
@pytest.fixture(scope="module")
def llm():
    """Provide one module-scoped ``LLM`` built from ``MODEL_NAME``.

    Tests receive a ``weakref.proxy`` rather than the instance itself:
    pytest caches fixture values, and handing out a weak proxy keeps that
    cache from holding the engine alive, so it can be garbage collected.
    ``cleanup()`` runs once the module's tests are done.
    """
    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
    )

    # The proxy is handed out while ``deprecate_legacy_api()`` is active.
    with engine.deprecate_legacy_api():
        yield weakref.proxy(engine)

        # Teardown: drop this fixture's own reference to the engine.
        del engine

    cleanup()
def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
                         o2: List[EmbeddingRequestOutput]):
    """Assert that two result lists carry pairwise-equal ``outputs``.

    Only the ``outputs`` attribute of each element is compared; any other
    attribute of the request outputs is ignored.

    Raises:
        AssertionError: if the two lists of ``outputs`` differ.
    """
    first_outputs = [item.outputs for item in o1]
    second_outputs = [item.outputs for item in o2]
    assert first_outputs == second_outputs
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
    """Legacy ``prompts=`` keyword and new-style inputs must encode alike.

    The legacy keyword call is expected to emit a ``DeprecationWarning``
    mentioning ``'prompts'``. Its result is then compared against the same
    prompt passed positionally as a string and as a ``{"prompt": ...}`` dict.
    """
    params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.encode(prompts=prompt, pooling_params=params)

    # Both new-style spellings of the same prompt must match the legacy call.
    for new_style_input in (prompt, {"prompt": prompt}):
        current_output = llm.encode(new_style_input, pooling_params=params)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
                                                    prompt_token_ids):
    """Legacy ``prompt_token_ids=`` keyword and dict input must encode alike.

    The legacy keyword call is expected to emit a ``DeprecationWarning``
    mentioning ``'prompt_token_ids'``.
    """
    params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
        legacy_output = llm.encode(
            prompt_token_ids=prompt_token_ids,
            pooling_params=params,
        )

    current_output = llm.encode(
        {"prompt_token_ids": prompt_token_ids},
        pooling_params=params,
    )
    assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
    """Batch of string prompts: legacy keyword vs. new-style inputs.

    The legacy ``prompts=`` call is expected to emit a ``DeprecationWarning``
    mentioning ``'prompts'``. Its result is compared against the batch passed
    positionally as a list of strings and as a list of ``{"prompt": ...}``
    dicts.
    """
    params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.encode(prompts=PROMPTS, pooling_params=params)

    prompts_as_dicts = [{"prompt": text} for text in PROMPTS]
    for new_style_batch in (PROMPTS, prompts_as_dicts):
        current_output = llm.encode(new_style_batch, pooling_params=params)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    """Batch of token-id prompts: legacy keyword vs. list-of-dicts input.

    The legacy ``prompt_token_ids=`` call is expected to emit a
    ``DeprecationWarning`` mentioning ``'prompt_token_ids'``.
    """
    params = PoolingParams()

    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
        legacy_output = llm.encode(
            prompt_token_ids=TOKEN_IDS,
            pooling_params=params,
        )

    token_dicts = [{"prompt_token_ids": ids} for ids in TOKEN_IDS]
    current_output = llm.encode(token_dicts, pooling_params=params)
    assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
    """Check how ``llm.encode`` accepts ``pooling_params`` in its four forms.

    Covers a per-prompt list, a list of the wrong length, a single shared
    instance, and ``None``.
    """
    per_prompt_params = [PoolingParams() for _ in range(4)]

    # Multiple PoolingParams should be matched with each prompt
    results = llm.encode(PROMPTS, pooling_params=per_prompt_params)
    assert len(PROMPTS) == len(results)

    # Exception raised, if the size of params does not match the size of prompts
    with pytest.raises(ValueError):
        llm.encode(PROMPTS, pooling_params=per_prompt_params[:3])

    # Single PoolingParams should be applied to every prompt
    shared_params = PoolingParams()
    results = llm.encode(PROMPTS, pooling_params=shared_params)
    assert len(PROMPTS) == len(results)

    # pooling_params is None, default params should be applied
    results = llm.encode(PROMPTS, pooling_params=None)
    assert len(PROMPTS) == len(results)
import weakref
from typing import List
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, RequestOutput, SamplingParams
from ..conftest import cleanup
# Model name passed to ``LLM(model=...)`` by the ``llm`` fixture in this module.
MODEL_NAME = "facebook/opt-125m"

# Text prompts shared by the string-based tests below.
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Pre-tokenized prompts (one list of token ids per prompt) shared by the
# token-based tests below.
TOKEN_IDS = [
    [0],
    [0, 1],
    [0, 2, 1],
    [0, 3, 1, 2],
]
def test_multiple_sampling_params(): pytestmark = pytest.mark.llm
llm = LLM(model="facebook/opt-125m",
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup()
def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
    """Assert that two result lists carry pairwise-equal ``outputs``.

    Only the ``outputs`` attribute of each element is compared; any other
    attribute of the request outputs is ignored.

    Raises:
        AssertionError: if the two lists of ``outputs`` differ.
    """
    first_outputs = [item.outputs for item in o1]
    second_outputs = [item.outputs for item in o2]
    assert first_outputs == second_outputs
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
    """Legacy ``prompts=`` keyword and new-style inputs must generate alike.

    Sampling uses ``temperature=0.0`` so the runs can be compared for
    equality. The legacy keyword call is expected to emit a
    ``DeprecationWarning`` mentioning ``'prompts'``.
    """
    greedy = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.generate(prompts=prompt, sampling_params=greedy)

    # Both new-style spellings of the same prompt must match the legacy call.
    for new_style_input in (prompt, {"prompt": prompt}):
        current_output = llm.generate(new_style_input,
                                      sampling_params=greedy)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
                                                    prompt_token_ids):
    """Legacy ``prompt_token_ids=`` keyword and dict input must generate alike.

    The legacy keyword call is expected to emit a ``DeprecationWarning``
    mentioning ``'prompt_token_ids'``.
    """
    greedy = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
        legacy_output = llm.generate(
            prompt_token_ids=prompt_token_ids,
            sampling_params=greedy,
        )

    current_output = llm.generate(
        {"prompt_token_ids": prompt_token_ids},
        sampling_params=greedy,
    )
    assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
    """Batch of string prompts: legacy keyword vs. new-style inputs.

    The legacy ``prompts=`` call is expected to emit a ``DeprecationWarning``
    mentioning ``'prompts'``. Its result is compared against the batch passed
    positionally as a list of strings and as a list of ``{"prompt": ...}``
    dicts.
    """
    greedy = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompts'"):
        legacy_output = llm.generate(prompts=PROMPTS, sampling_params=greedy)

    prompts_as_dicts = [{"prompt": text} for text in PROMPTS]
    for new_style_batch in (PROMPTS, prompts_as_dicts):
        current_output = llm.generate(new_style_batch,
                                      sampling_params=greedy)
        assert_outputs_equal(legacy_output, current_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
    """Batch of token-id prompts: legacy keyword vs. list-of-dicts input.

    The legacy ``prompt_token_ids=`` call is expected to emit a
    ``DeprecationWarning`` mentioning ``'prompt_token_ids'``.
    """
    greedy = SamplingParams(temperature=0.0, top_p=1.0)

    with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
        legacy_output = llm.generate(
            prompt_token_ids=TOKEN_IDS,
            sampling_params=greedy,
        )

    token_dicts = [{"prompt_token_ids": ids} for ids in TOKEN_IDS]
    current_output = llm.generate(token_dicts, sampling_params=greedy)
    assert_outputs_equal(legacy_output, current_output)
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [ sampling_params = [
SamplingParams(temperature=0.01, top_p=0.95), SamplingParams(temperature=0.01, top_p=0.95),
SamplingParams(temperature=0.3, top_p=0.95), SamplingParams(temperature=0.3, top_p=0.95),
...@@ -24,18 +127,18 @@ def test_multiple_sampling_params(): ...@@ -24,18 +127,18 @@ def test_multiple_sampling_params():
] ]
# Multiple SamplingParams should be matched with each prompt # Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(prompts, sampling_params=sampling_params) outputs = llm.generate(PROMPTS, sampling_params=sampling_params)
assert len(prompts) == len(outputs) assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts # Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError): with pytest.raises(ValueError):
outputs = llm.generate(prompts, sampling_params=sampling_params[:3]) outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3])
# Single SamplingParams should be applied to every prompt # Single SamplingParams should be applied to every prompt
single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
outputs = llm.generate(prompts, sampling_params=single_sampling_params) outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params)
assert len(prompts) == len(outputs) assert len(PROMPTS) == len(outputs)
# sampling_params is None, default params should be applied # sampling_params is None, default params should be applied
outputs = llm.generate(prompts, sampling_params=None) outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(prompts) == len(outputs) assert len(PROMPTS) == len(outputs)
\ No newline at end of file
import subprocess
import sys
import tempfile
from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501
# Batch input with one JSON request object per line: two chat-completion
# requests, each identified by a "custom_id".
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""

# Same batch, except the first request uses "invalid_field" where "custom_id"
# is expected; used to check that malformed input is rejected.
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
def test_e2e():
    """Run the batch entrypoint on ``INPUT_BATCH`` and validate its output.

    The batch is written to a temporary file and
    ``vllm.entrypoints.openai.run_batch`` is run as a subprocess with
    temporary input and output paths. The process must exit with status 0,
    and every line it wrote must parse as a ``BatchRequestOutput``.
    """
    with tempfile.NamedTemporaryFile("w") as input_file, \
            tempfile.NamedTemporaryFile("r") as output_file:
        # Flush so the child process sees the full batch on disk.
        input_file.write(INPUT_BATCH)
        input_file.flush()

        command = [
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "NousResearch/Meta-Llama-3-8B-Instruct",
        ]
        proc = subprocess.Popen(command)
        proc.communicate()
        exit_code = proc.wait()
        assert exit_code == 0, f"{proc=}"

        written = output_file.read().strip()
        for line in written.split("\n"):
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
def test_e2e_invalid_input():
    """
    Ensure that we fail when the input doesn't conform to the openai api.

    ``INVALID_INPUT_BATCH`` is fed to ``vllm.entrypoints.openai.run_batch``
    in a subprocess, which must exit with a non-zero status.
    """
    with tempfile.NamedTemporaryFile("w") as input_file, \
            tempfile.NamedTemporaryFile("r") as output_file:
        # Flush so the child process sees the full batch on disk.
        input_file.write(INVALID_INPUT_BATCH)
        input_file.flush()

        command = [
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "NousResearch/Meta-Llama-3-8B-Instruct",
        ]
        proc = subprocess.Popen(command)
        proc.communicate()
        exit_code = proc.wait()
        assert exit_code != 0, f"{proc=}"
# imports for guided decoding tests # imports for guided decoding tests
import json import json
import os
import re import re
import subprocess
import sys
import time
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
...@@ -12,7 +8,6 @@ import pytest ...@@ -12,7 +8,6 @@ import pytest
# using Ray for overall ease of process management, parallel requests, # using Ray for overall ease of process management, parallel requests,
# and debugging. # and debugging.
import ray import ray
import requests
import torch import torch
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -20,9 +15,11 @@ from openai import BadRequestError ...@@ -20,9 +15,11 @@ from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds from ..utils import ServerRunner
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here # generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora" LORA_NAME = "typeof/zephyr-7b-beta-lora"
...@@ -74,46 +71,7 @@ TEST_CHOICE = [ ...@@ -74,46 +71,7 @@ TEST_CHOICE = [
"Swift", "Kotlin" "Swift", "Kotlin"
] ]
pytestmark = pytest.mark.asyncio pytestmark = pytest.mark.openai
@ray.remote(num_gpus=1)
class ServerRunner:
    """Ray actor that runs the OpenAI-compatible API server in a subprocess.

    The constructor starts ``vllm.entrypoints.openai.api_server`` with the
    given command-line arguments and blocks until the server's ``/health``
    endpoint answers with HTTP 200.
    """

    def __init__(self, args):
        """Launch the server with ``args`` and wait until it is healthy.

        Args:
            args: list of extra command-line arguments for the API server.

        Raises:
            RuntimeError: if the server process exits, or does not become
                healthy within ``MAX_SERVER_START_WAIT_S`` seconds.
        """
        env = os.environ.copy()
        # Unbuffered output so server logs show up immediately.
        env["PYTHONUNBUFFERED"] = "1"
        self.proc = subprocess.Popen(
            # Use the interpreter running this actor rather than whatever
            # ``python3`` resolves to on PATH.
            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
            args,
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        self._wait_for_server()

    def ready(self):
        """Return ``True``; a completed call means ``__init__`` has finished."""
        return True

    def _wait_for_server(self):
        """Poll ``/health`` until it returns 200, the process dies, or we time out.

        The liveness check, the back-off sleep and the timeout check run on
        every unsuccessful attempt, both when the request raises and when
        the endpoint answers with a non-200 status, so the loop can neither
        spin without sleeping nor wait forever.
        """
        start = time.time()
        while True:
            last_err = None
            try:
                response = requests.get("http://localhost:8000/health")
                if response.status_code == 200:
                    return
            except Exception as err:
                # Typically "connection refused" while the server is booting.
                last_err = err

            if self.proc.poll() is not None:
                raise RuntimeError(
                    "Server exited unexpectedly.") from last_err

            time.sleep(0.5)
            if time.time() - start > MAX_SERVER_START_WAIT_S:
                raise RuntimeError(
                    "Server failed to start in time.") from last_err

    def __del__(self):
        # ``proc`` is missing if ``__init__`` failed before Popen returned.
        if hasattr(self, "proc"):
            self.proc.terminate()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
...@@ -121,7 +79,7 @@ def zephyr_lora_files(): ...@@ -121,7 +79,7 @@ def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME) return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="session") @pytest.fixture(scope="module")
def server(zephyr_lora_files): def server(zephyr_lora_files):
ray.init() ray.init()
server_runner = ServerRunner.remote([ server_runner = ServerRunner.remote([
...@@ -133,6 +91,8 @@ def server(zephyr_lora_files): ...@@ -133,6 +91,8 @@ def server(zephyr_lora_files):
"--max-model-len", "--max-model-len",
"8192", "8192",
"--enforce-eager", "--enforce-eager",
"--gpu-memory-utilization",
"0.75",
# lora config below # lora config below
"--enable-lora", "--enable-lora",
"--lora-modules", "--lora-modules",
...@@ -150,6 +110,27 @@ def server(zephyr_lora_files): ...@@ -150,6 +110,27 @@ def server(zephyr_lora_files):
ray.shutdown() ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(zephyr_lora_files):
    """Start a module-scoped API server serving ``EMBEDDING_MODEL_NAME``.

    Ray is shut down and re-initialised before the ``ServerRunner`` actor is
    created, and shut down again on teardown. The fixture yields the actor
    handle once its ``ready()`` call has returned.

    NOTE(review): ``zephyr_lora_files`` is requested but not used in this
    body; confirm whether the dependency is intentional.
    """
    ray.shutdown()
    ray.init()

    server_args = [
        "--model",
        EMBEDDING_MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--gpu-memory-utilization",
        "0.75",
        "--max-model-len",
        "8192",
    ]
    runner = ServerRunner.remote(server_args)

    # Block until the actor answers ``ready()``.
    ray.get(runner.ready.remote())
    yield runner
    ray.shutdown()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def client(): def client():
client = openai.AsyncOpenAI( client = openai.AsyncOpenAI(
...@@ -159,6 +140,7 @@ def client(): ...@@ -159,6 +140,7 @@ def client():
yield client yield client
@pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI): async def test_check_models(server, client: openai.AsyncOpenAI):
models = await client.models.list() models = await client.models.list()
models = models.data models = models.data
...@@ -170,6 +152,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): ...@@ -170,6 +152,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
assert lora_models[1].id == "zephyr-lora2" assert lora_models[1].id == "zephyr-lora2"
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # first test base model, then test loras
"model_name", "model_name",
...@@ -201,6 +184,27 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, ...@@ -201,6 +184,27 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
completion.choices[0].text) >= 5 completion.choices[0].text) >= 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(server, client: openai.AsyncOpenAI,
                           model_name: str):
    """A completion requested with ``logprobs=None`` must carry no logprobs.

    The request targets the parametrized ``model_name`` so that the base
    model and each LoRA are actually exercised.
    """
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    choice = completion.choices[0]
    assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # first test base model, then test loras
"model_name", "model_name",
...@@ -219,9 +223,75 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, ...@@ -219,9 +223,75 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
choice = completion.choices[0] choice = completion.choices[0]
assert choice.logprobs is not None assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is None assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) <= 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(server, client: openai.AsyncOpenAI,
                             model_name: str):
    """A completion with ``logprobs=5`` returns at most 6 top logprobs per token.

    The request targets the parametrized ``model_name`` so that both the
    base model and the LoRA are actually exercised.
    """
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    # The bound is one more than the number of logprobs requested.
    assert len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
                                            model_name: str):
    """Requesting ``logprobs=6`` on completions must be rejected.

    Both the non-streaming and the streaming request are expected to raise,
    after which the server must still answer a normal request. Every request
    targets the parametrized ``model_name`` so that the LoRA case is really
    exercised.
    """
    # test using token IDs
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=6,
        )
        ...
    # test using token IDs, streaming
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=6,
            stream=True,
        )
        # Consume the stream in case the error only surfaces while iterating.
        async for chunk in stream:
            ...

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    completion = completion.choices[0].text
    assert completion is not None and len(completion) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
"model_name", "model_name",
...@@ -248,8 +318,10 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, ...@@ -248,8 +318,10 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
chat_completion.choices) == 1 chat_completion.choices) == 1
assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].message is not None
assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.top_logprobs is not None assert chat_completion.choices[0].logprobs.content[
assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 0].top_logprobs is not None
assert len(
chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10 assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant" assert message.role == "assistant"
...@@ -266,9 +338,93 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, ...@@ -266,9 +338,93 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
                                model_name: str):
    """A chat completion with ``logprobs=False`` must carry no logprobs."""
    system_turn = {"role": "system", "content": "you are a helpful assistant"}
    user_turn = {"role": "user", "content": "what is 1+1?"}

    response = await client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
        max_tokens=5,
        temperature=0.0,
        logprobs=False,
    )

    first_choice = response.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
                                  model_name: str):
    """``logprobs=True`` with ``top_logprobs=0`` still returns logprob content.

    The first content entry may list at most one top logprob.
    """
    system_turn = {"role": "system", "content": "you are a helpful assistant"}
    user_turn = {"role": "user", "content": "what is 1+1?"}

    response = await client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
        max_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=0,
    )

    first_choice = response.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.content is not None
    first_token = first_choice.logprobs.content[0]
    assert len(first_token.top_logprobs) <= 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
                                  model_name: str):
    """``logprobs=True`` with ``top_logprobs=5`` returns logprob content.

    The first content entry may list at most six top logprobs.
    """
    system_turn = {"role": "system", "content": "you are a helpful assistant"}
    user_turn = {"role": "user", "content": "what is 1+1?"}

    response = await client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
        max_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=5,
    )

    first_choice = response.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.content is not None
    first_token = first_choice.logprobs.content[0]
    assert len(first_token.top_logprobs) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -277,13 +433,13 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, ...@@ -277,13 +433,13 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
"content": "what is 1+1?" "content": "what is 1+1?"
}] }]
# Default max_logprobs is 5, so this should raise an error # Default max_logprobs is 20, so this should raise an error
with pytest.raises((openai.BadRequestError, openai.APIError)): with pytest.raises((openai.BadRequestError, openai.APIError)):
stream = await client.chat.completions.create(model=model_name, stream = await client.chat.completions.create(model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=10, top_logprobs=21,
stream=True) stream=True)
async for chunk in stream: async for chunk in stream:
... ...
...@@ -293,25 +449,9 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, ...@@ -293,25 +449,9 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_tokens=10, max_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=10, top_logprobs=30,
stream=False) stream=False)
with pytest.raises((openai.BadRequestError, openai.APIError)):
stream = await client.completions.create(model=model_name,
prompt="Test",
max_tokens=10,
logprobs=10,
stream=True)
async for chunk in stream:
...
with pytest.raises(openai.BadRequestError):
await client.completions.create(model=model_name,
prompt="Test",
max_tokens=10,
logprobs=10,
stream=False)
# the server should still work afterwards # the server should still work afterwards
chat_completion = await client.chat.completions.create(model=model_name, chat_completion = await client.chat.completions.create(model=model_name,
messages=messages, messages=messages,
...@@ -321,6 +461,7 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, ...@@ -321,6 +461,7 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
"model_name", "model_name",
...@@ -358,6 +499,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, ...@@ -358,6 +499,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert "".join(chunks) == single_output assert "".join(chunks) == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
"model_name", "model_name",
...@@ -408,6 +550,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ...@@ -408,6 +550,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert "".join(chunks) == output assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
"model_name", "model_name",
...@@ -461,6 +604,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, ...@@ -461,6 +604,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
assert texts[0] == texts[1] assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_logits_bias(server, client: openai.AsyncOpenAI):
prompt = "Hello, my name is" prompt = "Hello, my name is"
max_tokens = 5 max_tokens = 5
...@@ -508,6 +652,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): ...@@ -508,6 +652,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
assert first_response != completion.choices[0].text assert first_response != completion.choices[0].text
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(server, client: openai.AsyncOpenAI, async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
...@@ -530,6 +675,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, ...@@ -530,6 +675,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(server, client: openai.AsyncOpenAI, async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
...@@ -576,6 +722,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, ...@@ -576,6 +722,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
assert json1["age"] != json2["age"] assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
...@@ -596,6 +743,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, ...@@ -596,6 +743,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
...@@ -633,6 +781,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, ...@@ -633,6 +781,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
assert ip1 != ip2 assert ip1 != ip2
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
...@@ -652,6 +801,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, ...@@ -652,6 +801,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
assert completion.choices[i].text in TEST_CHOICE assert completion.choices[i].text in TEST_CHOICE
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
...@@ -690,6 +840,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, ...@@ -690,6 +840,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
assert choice1 != choice2 assert choice1 != choice2
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
...@@ -725,6 +876,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, ...@@ -725,6 +876,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
...@@ -746,15 +898,15 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -746,15 +898,15 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
top_logprobs=5, top_logprobs=5,
extra_body=dict(guided_choice=TEST_CHOICE, extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
top_logprobs = chat_completion.choices[0].logprobs.top_logprobs top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI # -9999.0 is the minimum logprob returned by OpenAI
assert all( assert all(
isinstance(logprob, float) and logprob >= -9999.0 isinstance(token.logprob, float) and token.logprob >= -9999.0
for token_dict in top_logprobs for token in top_logprobs)
for token, logprob in token_dict.items())
@pytest.mark.asyncio
async def test_response_format_json_object(server, client: openai.AsyncOpenAI): async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
for _ in range(2): for _ in range(2):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
...@@ -772,6 +924,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): ...@@ -772,6 +924,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
assert loaded == {"result": 2}, loaded assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_extra_fields(server, client: openai.AsyncOpenAI): async def test_extra_fields(server, client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info: with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create( await client.chat.completions.create(
...@@ -787,6 +940,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): ...@@ -787,6 +940,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
assert "extra_forbidden" in exc_info.value.message assert "extra_forbidden" in exc_info.value.message
@pytest.mark.asyncio
async def test_complex_message_content(server, client: openai.AsyncOpenAI): async def test_complex_message_content(server, client: openai.AsyncOpenAI):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -806,6 +960,38 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): ...@@ -806,6 +960,38 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
assert content == "2" assert content == "2"
@pytest.mark.asyncio
async def test_custom_role(server, client: openai.AsyncOpenAI):
    """Check that a custom role yields the same reply for both content forms.

    How the model treats an unrecognised role is not asserted here; the test
    only requires that plain-string content and list-of-parts content
    produce identical completions under greedy, seeded sampling.
    """
    question = "what is 1+1?"
    payloads = (
        # Content given as a bare string.
        [{
            "role": "my-custom-role",
            "content": question,
        }],
        # Same content given as a list of typed parts.
        [{
            "role": "my-custom-role",
            "content": [{
                "type": "text",
                "text": question
            }]
        }],
    )

    # Requests are issued one after the other, string form first.
    responses = [
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,  # type: ignore
            temperature=0,
            seed=0) for messages in payloads
    ]

    string_reply, parts_reply = (r.choices[0].message.content
                                 for r in responses)
    assert string_reply == parts_reply
@pytest.mark.asyncio
async def test_guided_grammar(server, client: openai.AsyncOpenAI): async def test_guided_grammar(server, client: openai.AsyncOpenAI):
simple_sql_grammar = """ simple_sql_grammar = """
start: select_statement start: select_statement
...@@ -840,6 +1026,7 @@ number: "1" | "2" ...@@ -840,6 +1026,7 @@ number: "1" | "2"
assert content.strip() == ground_truth assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # first test base model, then test loras
"model_name", "model_name",
...@@ -871,6 +1058,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, ...@@ -871,6 +1058,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
assert len(logprobs.tokens) > 5 assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
async def test_long_seed(server, client: openai.AsyncOpenAI): async def test_long_seed(server, client: openai.AsyncOpenAI):
for seed in [ for seed in [
torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).min - 1,
...@@ -890,5 +1078,81 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): ...@@ -890,5 +1078,81 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
or "less_than_equal" in exc_info.value.message) or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding from text and one from token IDs.

    For each request the response must carry an id, exactly one embedding of
    length 4096, zero completion tokens, and the expected prompt/total token
    counts.
    """
    cases = (
        # (request input, expected prompt token count)
        (["The chef prepared a delicious meal."], 9),  # single text input
        ([1, 1, 1, 1, 1], 5),  # a single prompt given as token IDs
    )

    for request_input, expected_tokens in cases:
        result = await client.embeddings.create(
            model=model_name,
            input=request_input,
            encoding_format="float",
        )

        assert result.id is not None
        assert result.data is not None
        assert len(result.data) == 1
        assert len(result.data[0].embedding) == 4096

        usage = result.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_tokens
        assert usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    Each response must carry an id and one 4096-long embedding per input;
    token usage is additionally checked for the token-ID batch.
    """
    # test List[str]
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    sentence_result = await client.embeddings.create(
        model=model_name,
        input=sentences,
        encoding_format="float",
    )
    assert sentence_result.id is not None
    assert sentence_result.data is not None
    assert len(sentence_result.data) == 3
    assert len(sentence_result.data[0].embedding) == 4096

    # test List[List[int]]
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_result = await client.embeddings.create(
        model=model_name,
        input=token_batches,
        encoding_format="float",
    )
    assert token_result.id is not None
    assert token_result.data is not None
    assert len(token_result.data) == 4
    assert len(token_result.data[0].embedding) == 4096

    # 5 + 3 + 5 + 4 token IDs were submitted across the four prompts.
    token_usage = token_result.usage
    assert token_usage.completion_tokens == 0
    assert token_usage.prompt_tokens == 17
    assert token_usage.total_tokens == 17
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__]) pytest.main([__file__])
import multiprocessing
import sys import sys
import time import time
import pytest
import torch import torch
from openai import OpenAI, OpenAIError from openai import OpenAI, OpenAIError
...@@ -10,6 +10,8 @@ from vllm.model_executor.models.opt import OPTForCausalLM ...@@ -10,6 +10,8 @@ from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.utils import get_open_port from vllm.utils import get_open_port
pytestmark = pytest.mark.openai
class MyOPTForCausalLM(OPTForCausalLM): class MyOPTForCausalLM(OPTForCausalLM):
...@@ -26,15 +28,16 @@ def server_function(port): ...@@ -26,15 +28,16 @@ def server_function(port):
# register our dummy model # register our dummy model
ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
sys.argv = ["placeholder.py"] + \ sys.argv = ["placeholder.py"] + \
("--model facebook/opt-125m --dtype" ("--model facebook/opt-125m --gpu-memory-utilization 0.10 "
f" float32 --api-key token-abc123 --port {port}").split() f"--dtype float32 --api-key token-abc123 --port {port}").split()
import runpy import runpy
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
def test_oot_registration_for_api_server(): def test_oot_registration_for_api_server():
port = get_open_port() port = get_open_port()
server = multiprocessing.Process(target=server_function, args=(port, )) ctx = torch.multiprocessing.get_context()
server = ctx.Process(target=server_function, args=(port, ))
server.start() server.start()
client = OpenAI( client = OpenAI(
base_url=f"http://localhost:{port}/v1", base_url=f"http://localhost:{port}/v1",
......
...@@ -2,11 +2,12 @@ from typing import Type ...@@ -2,11 +2,12 @@ from typing import Type
import pytest import pytest
import torch import torch
from allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, SiluAndMul) NewGELU, SiluAndMul)
from .allclose_default import get_default_atol, get_default_rtol
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
D = [512, 4096, 5120, 13824] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment