Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6640dc0b
Commit
6640dc0b
authored
Jun 20, 2024
by
zhuwenwen
Browse files
Merge branch 'main' of
http://10.6.10.68/dcutoolkit/deeplearing/vllm
parents
44d4d334
83e4e0fe
Changes
110
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
534 additions
and
326 deletions
+534
-326
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+8
-4
tests/distributed/test_utils.py
tests/distributed/test_utils.py
+31
-0
tests/entrypoints/test_openai_embedding.py
tests/entrypoints/test_openai_embedding.py
+113
-0
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+47
-159
tests/entrypoints/test_openai_vision.py
tests/entrypoints/test_openai_vision.py
+17
-18
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+9
-9
tests/lora/conftest.py
tests/lora/conftest.py
+13
-10
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+2
-11
tests/models/test_fp8.py
tests/models/test_fp8.py
+10
-10
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+2
-11
tests/models/test_gptq_marlin_24.py
tests/models/test_gptq_marlin_24.py
+2
-11
tests/models/test_marlin.py
tests/models/test_marlin.py
+2
-11
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+3
-7
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+25
-2
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+49
-7
tests/quantization/utils.py
tests/quantization/utils.py
+14
-0
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+95
-18
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+1
-1
tests/utils.py
tests/utils.py
+88
-36
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+3
-1
No files found.
tests/distributed/test_pynccl.py
View file @
6640dc0b
...
@@ -6,10 +6,11 @@ import torch
...
@@ -6,10 +6,11 @@ import torch
import
torch.distributed
import
torch.distributed
from
vllm.distributed.communication_op
import
(
# noqa
from
vllm.distributed.communication_op
import
(
# noqa
graph_capture
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
)
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.device_communicators.pynccl_wrapper
import
NCCLLibrary
from
vllm.distributed.device_communicators.pynccl_wrapper
import
NCCLLibrary
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
get_world_group
,
graph_capture
,
init_distributed_environment
)
init_distributed_environment
)
from
vllm.utils
import
update_environment_variables
from
vllm.utils
import
update_environment_variables
...
@@ -53,7 +54,8 @@ def worker_fn_wrapper(fn):
...
@@ -53,7 +54,8 @@ def worker_fn_wrapper(fn):
@
worker_fn_wrapper
@
worker_fn_wrapper
def
worker_fn
():
def
worker_fn
():
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
with
pynccl_comm
.
change_state
(
enable
=
True
):
with
pynccl_comm
.
change_state
(
enable
=
True
):
...
@@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
...
@@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
def
worker_fn_with_cudagraph
():
def
worker_fn_with_cudagraph
():
with
torch
.
no_grad
():
with
torch
.
no_grad
():
graph
=
torch
.
cuda
.
CUDAGraph
()
graph
=
torch
.
cuda
.
CUDAGraph
()
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
# run something in the default stream to initialize torch engine
# run something in the default stream to initialize torch engine
a
=
torch
.
ones
((
4
,
4
),
device
=
f
'cuda:
{
pynccl_comm
.
rank
}
'
)
a
=
torch
.
ones
((
4
,
4
),
device
=
f
'cuda:
{
pynccl_comm
.
rank
}
'
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
...
@@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph():
...
@@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph():
@
worker_fn_wrapper
@
worker_fn_wrapper
def
send_recv_worker_fn
():
def
send_recv_worker_fn
():
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
if
pynccl_comm
.
rank
==
0
:
if
pynccl_comm
.
rank
==
0
:
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
...
...
tests/distributed/test_utils.py
0 → 100644
View file @
6640dc0b
import
os
import
ray
from
vllm.utils
import
cuda_device_count_stateless
@ray.remote
class _CUDADeviceCountStatelessTestActor:
    """Ray actor exposing the CUDA device count and CUDA_VISIBLE_DEVICES.

    The test drives this actor remotely: it rewrites the actor's
    ``CUDA_VISIBLE_DEVICES`` environment variable and then asks for the
    value reported by ``cuda_device_count_stateless``.
    """

    def get_cuda_visible_devices(self) -> str:
        """Return the current ``CUDA_VISIBLE_DEVICES`` value.

        Raises:
            KeyError: if the variable is not set in the environment.
        """
        return os.environ["CUDA_VISIBLE_DEVICES"]

    def set_cuda_visible_devices(self, cuda_visible_devices: str) -> None:
        """Overwrite ``CUDA_VISIBLE_DEVICES`` with the given string."""
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices

    def get_count(self):
        """Return whatever ``cuda_device_count_stateless()`` reports now."""
        return cuda_device_count_stateless()
def test_cuda_device_count_stateless():
    """Check that cuda_device_count_stateless follows CUDA_VISIBLE_DEVICES.

    An actor is created with two GPUs requested; the reported device count
    must track each subsequent change of the environment variable.
    """
    remote_actor = _CUDADeviceCountStatelessTestActor.options(
        num_gpus=2).remote()

    # Initial state right after the actor is created with two GPUs.
    visible_devices = ray.get(remote_actor.get_cuda_visible_devices.remote())
    assert visible_devices == "0,1"
    assert ray.get(remote_actor.get_count.remote()) == 2

    # (new CUDA_VISIBLE_DEVICES value, device count expected afterwards)
    for devices, expected_count in (("0", 1), ("", 0)):
        ray.get(remote_actor.set_cuda_visible_devices.remote(devices))
        assert ray.get(remote_actor.get_count.remote()) == expected_count
tests/entrypoints/test_openai_embedding.py
0 → 100644
View file @
6640dc0b
import
openai
import
pytest
import
ray
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
pytestmark
=
pytest
.
mark
.
openai
@pytest.fixture(scope="module")
def ray_ctx():
    """Module-scoped fixture that starts Ray and shuts it down afterwards.

    Ray is initialised with ``VLLM_PATH`` as the runtime working directory;
    ``ray.shutdown()`` runs once the fixture is torn down.
    """
    runtime_env = {"working_dir": VLLM_PATH}
    ray.init(runtime_env=runtime_env)
    yield
    ray.shutdown()
@pytest.fixture(scope="module")
def embedding_server(ray_ctx):
    """Module-scoped fixture returning a RemoteOpenAIServer for embeddings.

    Depends on ``ray_ctx`` so that Ray is initialised before the server
    object is created. The server is launched with the embedding model and
    the CLI arguments listed below.
    """
    return RemoteOpenAIServer([
        "--model",
        EMBEDDING_MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        # passed once only: the flag was previously duplicated at the end
        # of this list, which added nothing
        "--enforce-eager",
        "--max-model-len",
        "8192",
    ])
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
    """Module-scoped fixture returning the server's async client.

    The client is whatever ``embedding_server.get_async_client()`` returns
    (the tests annotate it as ``openai.AsyncOpenAI``).

    NOTE: no ``pytest.mark.asyncio`` here -- marks applied to fixtures have
    no effect and are deprecated by pytest; this fixture is synchronous.
    """
    return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding, first from text and then from token IDs.

    For both request forms the response must contain exactly one
    4096-dimensional embedding, and the usage accounting must report zero
    completion tokens with prompt/total tokens equal to the input length.
    """
    # (request payload, expected prompt/total token count)
    cases = (
        # a single sentence wrapped in a list
        (["The chef prepared a delicious meal."], 9),
        # a flat list of token IDs
        ([1, 1, 1, 1, 1], 5),
    )

    for payload, expected_tokens in cases:
        response = await embedding_client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )

        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096

        usage = response.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_tokens
        assert usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of texts and a batch of token lists.

    Each input in a batch must yield one 4096-dimensional embedding. For
    the token-ID batch the usage accounting is checked as well.
    """
    # Batch given as List[str].
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_response = await embedding_client.embeddings.create(
        model=model_name,
        input=sentences,
        encoding_format="float",
    )
    assert text_response.id is not None
    assert len(text_response.data) == 3
    assert len(text_response.data[0].embedding) == 4096

    # Batch given as List[List[int]]; 5 + 3 + 5 + 4 = 17 tokens overall.
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_response = await embedding_client.embeddings.create(
        model=model_name,
        input=token_batches,
        encoding_format="float",
    )
    assert token_response.id is not None
    assert len(token_response.data) == 4
    assert len(token_response.data[0].embedding) == 4096

    usage = token_response.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
tests/entrypoints/test_openai_server.py
View file @
6640dc0b
...
@@ -15,11 +15,10 @@ from openai import BadRequestError
...
@@ -15,11 +15,10 @@ from openai import BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
..utils
import
ServerRunn
er
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServ
er
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
...
@@ -80,9 +79,15 @@ def zephyr_lora_files():
...
@@ -80,9 +79,15 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
def
ray_ctx
():
ray
.
init
()
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
})
server_runner
=
ServerRunner
.
remote
([
yield
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
,
ray_ctx
):
return
RemoteOpenAIServer
([
"--model"
,
"--model"
,
MODEL_NAME
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
...
@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
...
@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
"--max-model-len"
,
"--max-model-len"
,
"8192"
,
"8192"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
# lora config below
# lora config below
"--enable-lora"
,
"--enable-lora"
,
"--lora-modules"
,
"--lora-modules"
,
...
@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
...
@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
"--max-num-seqs"
,
"--max-num-seqs"
,
"128"
,
"128"
,
])
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
(
zephyr_lora_files
):
def
client
(
server
):
ray
.
shutdown
()
return
server
.
get_async_client
()
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
EMBEDDING_MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
"--max-model-len"
,
"8192"
,
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
@
pytest
.
mark
.
asyncio
async
def
test_check_models
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
await
client
.
models
.
list
()
models
=
models
.
data
models
=
models
.
data
served_model
=
models
[
0
]
served_model
=
models
[
0
]
...
@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
...
@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
)
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
max_tokens
=
5
,
...
@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
...
@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
)
async
def
test_no_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_some_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_too_many_completion_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
with
pytest
.
raises
(
with
pytest
.
raises
(
...
@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
)
async
def
test_no_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_no_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
...
@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_zero_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_zero_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
...
@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_some_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_some_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
...
@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
...
@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_completion_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
prompt
=
"What is an LLM?"
prompt
=
"What is an LLM?"
...
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
...
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_chat_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
...
@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
)
async
def
test_chat_completion_stream_options
(
server
,
async
def
test_chat_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
...
@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
"model_name"
,
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
)
async
def
test_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
prompt
=
"What is the capital of France?"
prompt
=
"What is the capital of France?"
...
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
...
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
# test simple list
# test simple list
batch
=
await
client
.
completions
.
create
(
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
...
@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
...
@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_logits_bias
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_logits_bias
(
client
:
openai
.
AsyncOpenAI
):
prompt
=
"Hello, my name is"
prompt
=
"Hello, my name is"
max_tokens
=
5
max_tokens
=
5
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
...
@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
...
@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_json_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
...
@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
...
@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_regex_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
...
@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
...
@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_completion
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
...
@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
...
@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_decoding_type_error
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_decoding_type_error
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
_
=
await
client
.
completions
.
create
(
...
@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
...
@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_named_tool_use
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
...
@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_required_tool_use_not_yet_supported
(
async
def
test_required_tool_use_not_yet_supported
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_inconsistent_tool_choice_and_tools
(
async
def
test_inconsistent_tool_choice_and_tools
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
...
@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_object
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_response_format_json_object
(
client
:
openai
.
AsyncOpenAI
):
for
_
in
range
(
2
):
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
...
@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
...
@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_complex_message_content
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_complex_message_content
(
client
:
openai
.
AsyncOpenAI
):
resp
=
await
client
.
chat
.
completions
.
create
(
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
[{
messages
=
[{
...
@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
...
@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_custom_role
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_custom_role
(
client
:
openai
.
AsyncOpenAI
):
# Not sure how the model handles custom roles so we just check that
# Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way
# both string and complex message content are handled in the same way
...
@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
...
@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_guided_grammar
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_guided_grammar
(
client
:
openai
.
AsyncOpenAI
):
simple_sql_grammar
=
"""
simple_sql_grammar
=
"""
start: select_statement
start: select_statement
...
@@ -1351,7 +1315,7 @@ number: "1" | "2"
...
@@ -1351,7 +1315,7 @@ number: "1" | "2"
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
)
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_echo_logprob_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
logprobs_arg
:
int
):
model_name
:
str
,
logprobs_arg
:
int
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# test using text and token IDs
# test using text and token IDs
...
@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
...
@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_long_seed
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_long_seed
(
client
:
openai
.
AsyncOpenAI
):
for
seed
in
[
for
seed
in
[
torch
.
iinfo
(
torch
.
long
).
min
-
1
,
torch
.
iinfo
(
torch
.
long
).
min
-
1
,
torch
.
iinfo
(
torch
.
long
).
max
+
1
torch
.
iinfo
(
torch
.
long
).
max
+
1
...
@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
...
@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
or
"less_than_equal"
in
exc_info
.
value
.
message
)
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding, first from text and then from token IDs.

    Each request must return exactly one embedding of length 4096, report
    zero completion tokens, and report the expected prompt/total token count.

    ``embedding_server`` is requested but never referenced in the body;
    presumably it only ensures the server is running (verify against the
    fixture definition).
    """
    # (request input, expected prompt token count)
    cases = (
        (["The chef prepared a delicious meal."], 9),
        # Raw token IDs: five IDs in, five prompt tokens reported.
        ([1, 1, 1, 1, 1], 5),
    )

    for payload, expected_tokens in cases:
        response = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )

        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096

        usage = response.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_tokens
        assert usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    Both requests must return one embedding per input, each of length 4096
    (checked on the first item). Token usage is only checked for the
    token-ID batch.

    ``embedding_server`` is requested but never referenced in the body;
    presumably it only ensures the server is running (verify against the
    fixture definition).
    """

    async def embed(batch, expected_count):
        # Issue the request and run the checks common to both batches.
        response = await client.embeddings.create(
            model=model_name,
            input=batch,
            encoding_format="float",
        )
        assert response.id is not None
        assert len(response.data) == expected_count
        assert len(response.data[0].embedding) == 4096
        return response

    # test List[str]
    sentences = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    await embed(sentences, 3)

    # test List[List[int]]
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_response = await embed(token_batches, 4)

    # 5 + 3 + 5 + 4 token IDs were sent, hence 17 prompt tokens.
    usage = token_response.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
pytest
.
main
([
__file__
])
tests/entrypoints/test_openai_vision.py
View file @
6640dc0b
...
@@ -8,7 +8,7 @@ import ray
...
@@ -8,7 +8,7 @@ import ray
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
..utils
import
ServerRunn
er
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServ
er
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE
=
(
Path
(
__file__
).
parent
.
parent
.
parent
/
LLAVA_CHAT_TEMPLATE
=
(
Path
(
__file__
).
parent
.
parent
.
parent
/
...
@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
...
@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
pytestmark
=
pytest
.
mark
.
openai
pytestmark
=
pytest
.
mark
.
openai
@pytest.fixture(scope="module")
def ray_ctx():
    """Initialise Ray for the module and shut it down afterwards.

    Ray is started with ``VLLM_PATH`` as the runtime environment's working
    directory. Nothing is yielded to the test; the fixture exists for its
    setup/teardown side effects only.
    """
    runtime_env = {"working_dir": VLLM_PATH}
    ray.init(runtime_env=runtime_env)
    yield
    # Teardown: runs once the last test of the module has finished.
    ray.shutdown()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
ray
.
init
()
return
RemoteOpenAIServer
([
server_runner
=
ServerRunner
.
remote
([
"--model"
,
"--model"
,
MODEL_NAME
,
MODEL_NAME
,
"--dtype"
,
"--dtype"
,
...
@@ -47,18 +53,11 @@ def server():
...
@@ -47,18 +53,11 @@ def server():
"--chat-template"
,
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
str
(
LLAVA_CHAT_TEMPLATE
),
])
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
():
def
client
(
server
):
client
=
openai
.
AsyncOpenAI
(
return
server
.
get_async_client
()
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
@
pytest_asyncio
.
fixture
(
scope
=
"session"
)
@
pytest_asyncio
.
fixture
(
scope
=
"session"
)
...
@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
...
@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
model_name
:
str
,
image_url
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
...
@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image_base64encoded
(
async
def
test_single_chat_session_image_base64encoded
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
base64_encoded_image
:
Dict
[
str
,
str
]):
base64_encoded_image
:
Dict
[
str
,
str
]):
messages
=
[{
messages
=
[{
...
@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_chat_streaming_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_streaming_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
model_name
:
str
,
image_url
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
...
@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_multi_image_input
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_multi_image_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
model_name
:
str
,
image_url
:
str
):
image_url
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
...
tests/kernels/test_cutlass.py
View file @
6640dc0b
...
@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
...
@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
scale_b
=
(
torch
.
randn
(
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
out_dtype
)
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
out_dtype
)
...
@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
...
@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
scale_b
=
(
torch
.
randn
(
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
out_dtype
)
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
out_dtype
)
...
@@ -180,11 +180,11 @@ def test_cutlass_subset():
...
@@ -180,11 +180,11 @@ def test_cutlass_subset():
scale_a
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
scale_a
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
scale_b
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
scale_b
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
out
=
ops
.
cutlass_scaled_mm
_dq
(
a
,
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
b
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out_dtype
=
torch
.
bfloat16
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
torch
.
bfloat16
)
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
torch
.
bfloat16
)
...
@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
...
@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
self
.
out_dtype
=
out_dtype
self
.
out_dtype
=
out_dtype
def
forward
(
self
,
a
):
def
forward
(
self
,
a
):
return
ops
.
cutlass_scaled_mm
_dq
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
return
ops
.
cutlass_scaled_mm
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
self
.
out_dtype
)
self
.
out_dtype
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
...
...
tests/lora/conftest.py
View file @
6640dc0b
...
@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
...
@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
import
vllm
import
vllm
from
vllm.config
import
LoRAConfig
from
vllm.config
import
LoRAConfig
from
vllm.distributed
import
destroy_model_parallel
,
initialize_model_parallel
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
...
@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
...
@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
def
cleanup
():
def
cleanup
():
destroy_model_parallel
()
destroy_model_parallel
()
destroy_distributed_environment
()
with
contextlib
.
suppress
(
AssertionError
):
with
contextlib
.
suppress
(
AssertionError
):
torch
.
distributed
.
destroy_process_group
()
torch
.
distributed
.
destroy_process_group
()
gc
.
collect
()
gc
.
collect
()
...
@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
...
@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@
pytest
.
fixture
@
pytest
.
fixture
def
dist_init
():
def
dist_init
():
if
not
torch
.
distributed
.
is_initialized
():
temp_file
=
tempfile
.
mkstemp
()[
1
]
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
torch
.
distributed
.
init_process_group
(
world_size
=
1
,
backend
=
"nccl"
,
rank
=
0
,
world_size
=
1
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
rank
=
0
,
local_rank
=
0
,
init_method
=
f
"file://
{
temp_file
}
"
,
backend
=
"nccl"
,
)
)
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
cuda
())
initialize_model_parallel
(
1
,
1
)
initialize_model_parallel
(
1
,
1
)
yield
yield
cleanup
()
cleanup
()
...
...
tests/models/test_aqlm.py
View file @
6640dc0b
...
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
...
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
"""
"""
import
pytest
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
tests.quantization.utils
import
is_quant_method_supported
aqlm_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
# In this test we hardcode prompts and generations for the model so we don't
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
# need to require the AQLM package as a dependency
...
@@ -67,7 +58,7 @@ ground_truth_generations = [
...
@@ -67,7 +58,7 @@ ground_truth_generations = [
]
]
@
pytest
.
mark
.
skipif
(
aqlm_not
_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method
_supported
(
"aqlm"
)
,
reason
=
"AQLM is not supported on this GPU type."
)
reason
=
"AQLM is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_fp8.py
View file @
6640dc0b
...
@@ -8,8 +8,8 @@ import pytest
...
@@ -8,8 +8,8 @@ import pytest
import
torch
import
torch
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
...
@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
},
},
}
}
fp8_not_supported
=
True
if
torch
.
cuda
.
is_available
():
# This test compares against golden strings for exact match since
capability
=
torch
.
cuda
.
get_device_capability
()
# there is no baseline implementation to compare against
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
# and is unstable w.r.t specifics of the fp8 implementation or
fp8_not_supported
=
(
capability
<
# the hardware being run on.
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
# Disabled to prevent it from breaking the build
@
pytest
.
mark
.
skip
(
reason
=
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
"Prevent unstable test based on golden strings from breaking the build."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
...
...
tests/models/test_gptq_marlin.py
View file @
6640dc0b
...
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
...
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
import
os
import
os
import
pytest
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.rotary_embedding
import
_ROPE_DICT
from
vllm.model_executor.layers.rotary_embedding
import
_ROPE_DICT
from
.utils
import
check_logprobs_close
from
.utils
import
check_logprobs_close
...
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
...
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
MAX_MODEL_LEN
=
1024
gptq_marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
MODELS
=
[
# act_order==False, group_size=channelwise
# act_order==False, group_size=channelwise
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
...
@@ -53,7 +44,7 @@ MODELS = [
...
@@ -53,7 +44,7 @@ MODELS = [
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
pytest
.
mark
.
skipif
(
gptq_marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
,
reason
=
"gptq_marlin is not supported on this GPU type."
)
reason
=
"gptq_marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
...
...
tests/models/test_gptq_marlin_24.py
View file @
6640dc0b
...
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
...
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
import
pytest
import
pytest
import
torch
from
tests.models.utils
import
check_logprobs_close
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
tests.quantization.utils
import
is_quant_method_supported
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
@
dataclass
...
@@ -47,7 +38,7 @@ model_pairs = [
...
@@ -47,7 +38,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin_24"
)
,
reason
=
"Marlin24 is not supported on this GPU type."
)
reason
=
"Marlin24 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_marlin.py
View file @
6640dc0b
...
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
...
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
import
pytest
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
.utils
import
check_logprobs_close
from
.utils
import
check_logprobs_close
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
@
dataclass
class
ModelPair
:
class
ModelPair
:
...
@@ -45,7 +36,7 @@ model_pairs = [
...
@@ -45,7 +36,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"marlin"
)
,
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
6640dc0b
...
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
...
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
import
pytest
import
pytest
import
torch
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
@
pytest
.
mark
.
skipif
(
reason
=
'bitsandbytes is not supported on this GPU type.'
)
capability
<
QUANTIZATION_METHODS
[
'bitsandbytes'
].
get_min_capability
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
def
test_load_bnb_model
(
vllm_runner
)
->
None
:
def
test_load_bnb_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
'huggyllama/llama-7b'
,
with
vllm_runner
(
'huggyllama/llama-7b'
,
quantization
=
'bitsandbytes'
,
quantization
=
'bitsandbytes'
,
...
...
tests/quantization/test_compressed_tensors.py
View file @
6640dc0b
...
@@ -3,12 +3,13 @@
...
@@ -3,12 +3,13 @@
Run `pytest tests/quantization/test_compressed_tensors.py`.
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
"""
import
pytest
import
torch
import
torch
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW
8A8DynamicToken
,
CompressedTensorsLinearMethod
,
CompressedTensorsW
4A16
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
...
@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
...
@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8DynamicToken
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8DynamicToken
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
@pytest.mark.parametrize("w4a16_args", [
    ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
    ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
])
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
    """Check that a W4A16 checkpoint loads with the expected scheme.

    ``w4a16_args`` is a ``(model id, strategy, group size)`` tuple. The first
    decoder layer's ``qkv_proj`` must use ``CompressedTensorsLinearMethod``
    with a ``CompressedTensorsW4A16`` scheme whose strategy and group size
    match the parameters, and its packed weights / scales must have the
    asserted dtypes and pack factor.
    """
    model_id, expected_strategy, expected_group_size = w4a16_args

    with vllm_runner(model_id) as llm:
        # Walk from the runner down to the loaded model object.
        engine = llm.model.llm_engine
        model_runner = engine.model_executor.driver_worker.model_runner
        first_layer = model_runner.model.model.layers[0]
        qkv_proj = first_layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        scheme = qkv_proj.scheme
        assert isinstance(scheme, CompressedTensorsW4A16)

        assert scheme.strategy == expected_strategy
        assert scheme.group_size == expected_group_size

        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
        assert qkv_proj.weight_packed.pack_factor == 8
tests/quantization/test_fp8.py
View file @
6640dc0b
...
@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
...
@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
import
pytest
import
pytest
import
torch
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm._custom_ops
import
scaled_fp8_quant
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
@
pytest
.
mark
.
skipif
(
reason
=
"FP8 is not supported on this GPU type."
)
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
...
@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
...
@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Compare ``scaled_fp8_quant`` against a pure-PyTorch reference.

    The kernel is called without a scale (labelled dynamic quantization),
    with the scale it returned (labelled static quantization), and with
    ``batch_dim_padding``. Every result is dequantized and compared with the
    dequantized output of the first call via ``torch.allclose``.
    """
    fp8_dtype = torch.float8_e4m3fn

    def quantize_ref(tensor, inv_scale):
        # The reference implementation that fully aligns to
        # the kernel being tested.
        limits = torch.finfo(fp8_dtype)
        scaled = tensor.to(torch.float32) * inv_scale.reciprocal()
        return scaled.clamp(min=limits.min, max=limits.max).to(fp8_dtype)

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        # Cast back to the working dtype, then undo the scaling.
        return tensor.to(dtype) * inv_scale

    # Note that we use a shape % 4 != 0 to cover edge cases,
    # because scaled_fp8_quant is vectorized by 4.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
    num_rows = x.shape[0]

    # Dynamic quantization: the kernel returns the quantized tensor together
    # with the inverse scale that every later step reuses.
    kernel_q, inv_scale = scaled_fp8_quant(x, None)
    expected = per_tensor_dequantize(kernel_q, inv_scale, dtype)

    # Reference dynamic quantization
    reference_q = quantize_ref(x, inv_scale)
    assert torch.allclose(expected,
                          per_tensor_dequantize(reference_q, inv_scale, dtype))

    # Static quantization
    static_q, _ = scaled_fp8_quant(x, inv_scale)
    assert torch.allclose(expected,
                          per_tensor_dequantize(static_q, inv_scale, dtype))

    # Padding
    padded_q, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
    assert padded_q.shape[0] == 17
    # Only the first ``num_rows`` rows are compared with the expectation.
    unpadded_q = torch.narrow(padded_q, 0, 0, num_rows)
    assert torch.allclose(expected,
                          per_tensor_dequantize(unpadded_q, inv_scale, dtype))
tests/quantization/utils.py
0 → 100644
View file @
6640dc0b
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
def is_quant_method_supported(quant_method: str) -> bool:
    """Return whether the current GPU meets ``quant_method``'s requirement.

    Args:
        quant_method: Key into ``QUANTIZATION_METHODS`` (for example
            ``"fp8"`` or ``"gptq_marlin"``).

    Returns:
        ``False`` when CUDA is unavailable. Otherwise ``True`` if the
        device's compute capability, encoded as ``major * 10 + minor``, is at
        least the value returned by the method's ``get_min_capability()``.

    Raises:
        KeyError: presumably, if ``quant_method`` is not registered
            (assumes ``QUANTIZATION_METHODS`` is a plain mapping).
    """
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability()
    capability = major * 10 + minor
    # "Supported" means the device is AT LEAST the minimum capability.
    # Callers use `skipif(not is_quant_method_supported(...))`, so a `<`
    # here would skip tests on capable GPUs and run them on incapable ones.
    return (capability >=
            QUANTIZATION_METHODS[quant_method].get_min_capability())
tests/tensorizer_loader/test_tensorizer.py
View file @
6640dc0b
import
json
import
json
import
os
import
os
import
pathlib
import
subprocess
import
subprocess
from
unittest.mock
import
MagicMock
,
patch
from
unittest.mock
import
MagicMock
,
patch
import
openai
import
openai
import
pytest
import
pytest
import
ray
import
ray
import
torch
from
tensorizer
import
EncryptionParams
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
# yapf: disable
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
TensorSerializer
,
TensorSerializer
,
is_vllm_tensorized
,
is_vllm_tensorized
,
load_with_tensorizer
,
load_with_tensorizer
,
open_stream
,
open_stream
,
serialize_vllm_model
)
serialize_vllm_model
,
tensorize_vllm_model
)
from
..utils
import
ServerRunner
from
..conftest
import
VllmRunner
,
cleanup
from
..utils
import
RemoteOpenAIServer
# yapf conflicts with isort for this docstring
# yapf conflicts with isort for this docstring
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
...
@@ -42,6 +49,20 @@ def is_curl_installed():
...
@@ -42,6 +49,20 @@ def is_curl_installed():
except
(
subprocess
.
CalledProcessError
,
FileNotFoundError
):
except
(
subprocess
.
CalledProcessError
,
FileNotFoundError
):
return
False
return
False
def get_torch_model(vllm_runner: VllmRunner):
    """Return the model held by the runner's driver-worker model runner.

    Follows ``model.llm_engine.model_executor.driver_worker.model_runner``
    on ``vllm_runner`` and returns that object's ``model`` attribute.
    """
    engine = vllm_runner.model.llm_engine
    driver_worker = engine.model_executor.driver_worker
    return driver_worker.model_runner.model
def write_keyfile(keyfile_path: str):
    """Write a freshly generated random encryption key to ``keyfile_path``.

    Missing parent directories are created first. The file is opened in
    binary write mode, so an existing file is overwritten.
    """
    # Generate the key before touching the filesystem, as the original did.
    key_bytes_source = EncryptionParams.random()
    target = pathlib.Path(keyfile_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('wb') as key_file:
        key_file.write(key_bytes_source.key)
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
tensorizer_config
():
def
tensorizer_config
():
...
@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
...
@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
with
vllm_runner
(
model_ref
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
key_path
=
tmp_path
/
(
model_ref
+
".key"
)
key_path
=
tmp_path
/
(
model_ref
+
".key"
)
write_keyfile
(
key_path
)
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
config_for_serializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
)
config_for_serializing
=
TensorizerConfig
(
serialize_vllm_model
(
vllm_model
.
model
.
llm_engine
,
tensorizer_uri
=
model_path
,
config_for_serializing
,
encryption_keyfile
=
key_path
encryption_key_path
=
key_path
)
)
serialize_vllm_model
(
get_torch_model
(
vllm_model
),
config_for_serializing
)
config_for_deserializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
,
config_for_deserializing
=
TensorizerConfig
(
tensorizer_uri
=
model_path
,
encryption_keyfile
=
key_path
)
encryption_keyfile
=
key_path
)
...
@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
...
@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
TensorizerConfig
(
tensorizer_uri
=
model_path
))
TensorizerConfig
(
tensorizer_uri
=
model_path
))
with
vllm_runner
(
with
vllm_runner
(
...
@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
...
@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
with
vllm_runner
(
model_ref
,
)
as
vllm_model
:
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
model_path
=
tmp_path
/
(
model_ref
+
".tensors"
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
TensorizerConfig
(
tensorizer_uri
=
model_path
))
TensorizerConfig
(
tensorizer_uri
=
model_path
))
model_loader_extra_config
=
{
model_loader_extra_config
=
{
...
@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
...
@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
openai_args
=
[
openai_args
=
[
"--model"
,
model_ref
,
"--dtype"
,
"float16"
,
"--load-format"
,
"--model"
,
model_ref
,
"--dtype"
,
"float16"
,
"--load-format"
,
"tensorizer"
,
"--model-loader-extra-config"
,
"tensorizer"
,
"--model-loader-extra-config"
,
json
.
dumps
(
model_loader_extra_config
),
"--port"
,
"8000"
json
.
dumps
(
model_loader_extra_config
),
]
]
server
=
ServerRunner
.
remote
(
openai_args
)
server
=
RemoteOpenAIServer
(
openai_args
)
assert
ray
.
get
(
server
.
ready
.
remote
())
print
(
"Server ready."
)
print
(
"Server ready."
)
client
=
openai
.
OpenAI
(
client
=
server
.
get_client
()
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
completion
=
client
.
completions
.
create
(
model
=
model_ref
,
completion
=
client
.
completions
.
create
(
model
=
model_ref
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
max_tokens
=
5
,
...
@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
...
@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
model_loader_extra_config
=
TensorizerConfig
(
tensorizer_uri
=
"test"
))
model_loader_extra_config
=
TensorizerConfig
(
tensorizer_uri
=
"test"
))
def
test_tensorizer_with_tp
(
vllm_runner
):
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Requires 2 GPUs"
)
def
test_tensorizer_with_tp_path_without_template
(
vllm_runner
):
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
model_ref
=
"EleutherAI/pythia-1.4b"
model_ref
=
"EleutherAI/pythia-1.4b"
tensorized_path
=
f
"s3://tensorized/
{
model_ref
}
/fp16/model.tensors"
tensorized_path
=
f
"s3://tensorized/
{
model_ref
}
/fp16/model.tensors"
...
@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
...
@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
s3_endpoint
=
"object.ord1.coreweave.com"
,
s3_endpoint
=
"object.ord1.coreweave.com"
,
),
),
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
disable_custom_all_reduce
=
True
,
)
)
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
                                                                    tmp_path):
    """Check that a TP=2 encrypted tensorized model reproduces baseline outputs.

    Generates reference outputs with a plainly loaded model, serializes the
    model with ``tensor_parallel_size=2`` and an encryption keyfile, reloads
    it with ``load_format="tensorizer"``, and asserts that both generations
    are equal.
    """
    model_ref = "EleutherAI/pythia-1.4b"
    # record outputs from un-sharded un-tensorized model
    base_model = vllm_runner(
        model_ref,
        disable_custom_all_reduce=True,
        enforce_eager=True,
    )
    outputs = base_model.generate(prompts, sampling_params)

    # Tear the baseline engine down explicitly before the next engine is
    # created: shut down its executor, drop the reference, then run the
    # shared cleanup helper and stop ray.
    base_model.model.llm_engine.model_executor.shutdown()
    del base_model
    cleanup()
    ray.shutdown()

    # load model with two shards and serialize with encryption
    # The "%02d" placeholder stays in the URI; the asserts below expect one
    # file each for indices 0 and 1 (presumably the shard rank -- confirm
    # against tensorize_vllm_model).
    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
    key_path = tmp_path / (model_ref + ".key")

    # The same config object is used for serializing and for loading.
    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        encryption_keyfile=key_path,
    )

    # NOTE(review): nothing in this test writes key_path beforehand, so
    # tensorize_vllm_model is presumably expected to create it -- confirm.
    tensorize_vllm_model(
        engine_args=EngineArgs(
            model=model_ref,
            tensor_parallel_size=2,
            disable_custom_all_reduce=True,
            enforce_eager=True,
        ),
        tensorizer_config=tensorizer_config,
    )
    # One serialized file per shard must exist.
    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
    cleanup()
    ray.shutdown()

    # Reload the sharded, encrypted artifacts through the tensorizer loader.
    loaded_vllm_model = vllm_runner(
        model_ref,
        tensor_parallel_size=2,
        load_format="tensorizer",
        disable_custom_all_reduce=True,
        enforce_eager=True,
        model_loader_extra_config=tensorizer_config)

    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)

    assert outputs == deserialized_outputs
def
test_vllm_tensorized_model_has_same_outputs
(
vllm_runner
,
tmp_path
):
def
test_vllm_tensorized_model_has_same_outputs
(
vllm_runner
,
tmp_path
):
model_ref
=
"facebook/opt-125m"
model_ref
=
"facebook/opt-125m"
...
@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
...
@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
with
vllm_runner
(
model_ref
)
as
vllm_model
:
with
vllm_runner
(
model_ref
)
as
vllm_model
:
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
outputs
=
vllm_model
.
generate
(
prompts
,
sampling_params
)
serialize_vllm_model
(
vllm_model
.
model
.
llm_
engine
,
config
)
serialize_vllm_model
(
get_torch_
model
(
v
llm_
model
)
,
config
)
assert
is_vllm_tensorized
(
config
)
assert
is_vllm_tensorized
(
config
)
...
...
tests/test_sharded_state_loader.py
View file @
6640dc0b
...
@@ -39,7 +39,7 @@ def test_filter_subtensors():
...
@@ -39,7 +39,7 @@ def test_filter_subtensors():
filtered_state_dict
=
ShardedStateLoader
.
_filter_subtensors
(
state_dict
)
filtered_state_dict
=
ShardedStateLoader
.
_filter_subtensors
(
state_dict
)
assert
tuple
(
filtered_state_dict
.
keys
())
==
(
"a"
,
"b"
,
"c"
)
assert
tuple
(
filtered_state_dict
.
keys
())
==
(
"a"
,
"b"
,
"c"
)
for
key
,
tensor
in
filtered_state_dict
.
items
():
for
key
,
tensor
in
filtered_state_dict
.
items
():
# NOTE: don't use `e
u
qal` here, as the tensor might contain NaNs
# NOTE: don't use `eq
u
al` here, as the tensor might contain NaNs
assert
tensor
is
state_dict
[
key
]
assert
tensor
is
state_dict
[
key
]
...
...
tests/utils.py
View file @
6640dc0b
...
@@ -4,57 +4,109 @@ import sys
...
@@ -4,57 +4,109 @@ import sys
import
time
import
time
import
warnings
import
warnings
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
from
typing
import
List
import
openai
import
ray
import
ray
import
requests
import
requests
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
init_distributed_environment
)
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.utils
import
get_open_port
from
vllm.utils
import
get_open_port
# Path to root of repository so that utilities can be imported by ray workers
# Path to root of repository so that utilities can be imported by ray workers
VLLM_PATH
=
os
.
path
.
abspath
(
os
.
path
.
join
(
__file__
,
os
.
pardir
,
os
.
pardir
))
VLLM_PATH
=
os
.
path
.
abspath
(
os
.
path
.
join
(
__file__
,
os
.
pardir
,
os
.
pardir
))
@
ray
.
remote
(
num_gpus
=
1
)
class
RemoteOpenAIServer
:
class
ServerRunner
:
DUMMY_API_KEY
=
"token-abc123"
# vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S
=
600
# wait up to 600 seconds for the server to start
MAX_SERVER_START_WAIT_S
=
600
# wait up to 600 seconds for the server to start
def
__init__
(
self
,
args
):
@
ray
.
remote
(
num_gpus
=
1
)
env
=
os
.
environ
.
copy
()
class
_RemoteRunner
:
env
[
"PYTHONUNBUFFERED"
]
=
"1"
self
.
proc
=
subprocess
.
Popen
(
def
__init__
(
self
,
cli_args
:
List
[
str
],
*
,
wait_url
:
str
,
[
sys
.
executable
,
"-m"
,
"vllm.entrypoints.openai.api_server"
]
+
wait_timeout
:
float
)
->
None
:
args
,
env
=
os
.
environ
.
copy
()
env
=
env
,
env
[
"PYTHONUNBUFFERED"
]
=
"1"
stdout
=
sys
.
stdout
,
self
.
proc
=
subprocess
.
Popen
(
stderr
=
sys
.
stderr
,
[
sys
.
executable
,
"-m"
,
"vllm.entrypoints.openai.api_server"
,
*
cli_args
],
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
,
)
self
.
_wait_for_server
(
url
=
wait_url
,
timeout
=
wait_timeout
)
def ready(self):
    """Readiness probe: always returns ``True``."""
    return True
def _wait_for_server(self, *, url: str, timeout: float):
    """Poll ``url`` until it answers HTTP 200 or ``timeout`` seconds elapse.

    Args:
        url: Health-check URL to request.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If a request fails and the server process has
            exited, or if no 200 response arrives within ``timeout``
            seconds.
    """
    # run health check
    start = time.time()
    while True:
        last_err = None
        try:
            if requests.get(url).status_code == 200:
                return
        except Exception as err:
            # A failed request is only fatal if the process has died.
            if self.proc.poll() is not None:
                raise RuntimeError("Server exited unexpectedly.") from err
            last_err = err

        # Reached after a request error AND after a non-200 response, so
        # every failed probe is throttled and counted against the timeout.
        # Previously a non-200 response retried immediately, with no limit.
        time.sleep(0.5)
        if time.time() - start > timeout:
            raise RuntimeError(
                "Server failed to start in time.") from last_err
def __del__(self):
    """Terminate the server subprocess if one was started."""
    # `proc` is absent when construction failed before Popen returned.
    if not hasattr(self, "proc"):
        return
    self.proc.terminate()
def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
    """Start the API server through the remote runner and wait for it.

    Args:
        cli_args: Command-line arguments passed to the server.
        auto_port: When true, ``--port`` with the value of
            ``get_open_port()`` is appended to the arguments, and the
            caller must not pass a port.

    Raises:
        ValueError: If ``auto_port`` is true and ``-p`` or ``--port``
            appears in ``cli_args``.
    """
    if auto_port:
        if "-p" in cli_args or "--port" in cli_args:
            # The trailing space keeps the two literals from fusing into
            # "...the portwhen ...".
            raise ValueError("You have manually specified the port "
                             "when `auto_port=True`.")

        # Concatenate into a new list so the caller's list is not mutated.
        cli_args = cli_args + ["--port", str(get_open_port())]

    # Parse with the parser from make_arg_parser() to get the host and port.
    parser = make_arg_parser()
    args = parser.parse_args(cli_args)
    self.host = str(args.host or 'localhost')
    self.port = int(args.port)

    self._runner = self._RemoteRunner.remote(
        cli_args,
        wait_url=self.url_for("health"),
        wait_timeout=self.MAX_SERVER_START_WAIT_S)

    self._wait_until_ready()
@property
def url_root(self) -> str:
    """Base URL of the server, ``http://<host>:<port>``."""
    host, port = self.host, self.port
    return f"http://{host}:{port}"
def url_for(self, *parts: str) -> str:
    """Join ``parts`` with ``/`` and append them to ``url_root``.

    With no parts the result is ``url_root`` followed by a single ``/``.
    """
    path = "/".join(parts)
    return f"{self.url_root}/{path}"
def _wait_until_ready(self) -> None:
    """Block until the remote runner's ``ready`` call has returned."""
    ready_ref = self._runner.ready.remote()
    ray.get(ready_ref)
def get_client(self):
    """Return an ``openai.OpenAI`` client for this server's ``v1`` URL.

    The client is created with the class's ``DUMMY_API_KEY``.
    """
    client_kwargs = {
        "base_url": self.url_for("v1"),
        "api_key": self.DUMMY_API_KEY,
    }
    return openai.OpenAI(**client_kwargs)
def get_async_client(self):
    """Return an ``openai.AsyncOpenAI`` client for this server's ``v1`` URL.

    The client is created with the class's ``DUMMY_API_KEY``.
    """
    client_kwargs = {
        "base_url": self.url_for("v1"),
        "api_key": self.DUMMY_API_KEY,
    }
    return openai.AsyncOpenAI(**client_kwargs)
)
self
.
_wait_for_server
()
def ready(self):
    """Readiness probe: always returns ``True``."""
    return True
def _wait_for_server(self):
    """Poll the local health endpoint until it answers HTTP 200.

    Raises:
        RuntimeError: If a request fails and the server process has
            exited, or if no 200 response arrives within
            ``MAX_SERVER_START_WAIT_S`` seconds.
    """
    # run health check
    start = time.time()
    while True:
        last_err = None
        try:
            if requests.get(
                    "http://localhost:8000/health").status_code == 200:
                return
        except Exception as err:
            # A failed request is only fatal if the process has died.
            if self.proc.poll() is not None:
                raise RuntimeError("Server exited unexpectedly.") from err
            last_err = err

        # Throttle and time-limit every failed probe, including non-200
        # responses, which previously retried immediately with no limit.
        time.sleep(0.5)
        if time.time() - start > self.MAX_SERVER_START_WAIT_S:
            raise RuntimeError(
                "Server failed to start in time.") from last_err
def __del__(self):
    """Terminate the server subprocess if one was started."""
    # `proc` is absent when construction failed before Popen returned.
    if not hasattr(self, "proc"):
        return
    self.proc.terminate()
def
init_test_distributed_environment
(
def
init_test_distributed_environment
(
...
...
tests/worker/test_model_runner.py
View file @
6640dc0b
import
pytest
import
pytest
import
torch
import
torch
from
vllm.distributed.parallel_state
import
init_distributed_environment
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
...
@@ -292,6 +293,7 @@ def distributed_init():
...
@@ -292,6 +293,7 @@ def distributed_init():
rank
=
0
,
rank
=
0
,
distributed_init_method
=
f
"tcp://127.0.0.1:
{
get_open_port
()
}
"
,
distributed_init_method
=
f
"tcp://127.0.0.1:
{
get_open_port
()
}
"
,
local_rank
=
0
)
local_rank
=
0
)
ensure_model_parallel_initialized
(
1
,
1
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
2
,
128
)))
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
2
,
128
)))
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment