Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
448
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1012 additions
and
828 deletions
+1012
-828
tests/entrypoints/openai/rpc/__init__.py
tests/entrypoints/openai/rpc/__init__.py
+0
-0
tests/entrypoints/openai/rpc/test_zmq_client.py
tests/entrypoints/openai/rpc/test_zmq_client.py
+120
-0
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+55
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+253
-0
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+0
-9
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+83
-1
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+39
-9
tests/entrypoints/openai/test_disable_mp.py
tests/entrypoints/openai/test_disable_mp.py
+0
-715
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+50
-0
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+179
-0
tests/entrypoints/openai/test_mp_api_server.py
tests/entrypoints/openai/test_mp_api_server.py
+40
-0
tests/entrypoints/openai/test_oot_registration.py
tests/entrypoints/openai/test_oot_registration.py
+42
-70
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+22
-0
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_run_batch.py
+50
-2
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+0
-1
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+47
-0
tests/kernels/quant_utils.py
tests/kernels/quant_utils.py
+23
-12
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+5
-5
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+2
-2
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+2
-2
No files found.
Too many changes to show.
To preserve performance only
448 of 448+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/rpc/__init__.py
0 → 100644
View file @
af7f4372
tests/entrypoints/openai/rpc/test_zmq_client.py
0 → 100644
View file @
af7f4372
import
asyncio
import
tempfile
import
unittest
import
unittest.mock
import
uuid
import
pytest
import
pytest_asyncio
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.rpc.client
import
(
AsyncEngineRPCClient
,
RPCClientClosedError
)
from
vllm.entrypoints.openai.rpc.server
import
AsyncEngineRPCServer
@
pytest
.
fixture
(
scope
=
"function"
)
def
tmp_socket
():
with
tempfile
.
TemporaryDirectory
()
as
td
:
yield
f
"ipc://
{
td
}
/
{
uuid
.
uuid4
()
}
"
@
pytest_asyncio
.
fixture
(
scope
=
"function"
)
async
def
dummy_server
(
tmp_socket
,
monkeypatch
):
dummy_engine
=
unittest
.
mock
.
AsyncMock
()
def
dummy_engine_builder
(
*
args
,
**
kwargs
):
return
dummy_engine
with
monkeypatch
.
context
()
as
m
:
m
.
setattr
(
AsyncLLMEngine
,
"from_engine_args"
,
dummy_engine_builder
)
server
=
AsyncEngineRPCServer
(
None
,
None
,
rpc_path
=
tmp_socket
)
loop
=
asyncio
.
get_running_loop
()
server_task
=
loop
.
create_task
(
server
.
run_server_loop
())
try
:
yield
server
finally
:
server_task
.
cancel
()
server
.
cleanup
()
@
pytest_asyncio
.
fixture
(
scope
=
"function"
)
async
def
client
(
tmp_socket
):
client
=
AsyncEngineRPCClient
(
rpc_path
=
tmp_socket
)
# Sanity check: the server is connected
await
client
.
_wait_for_server_rpc
()
try
:
yield
client
finally
:
client
.
close
()
@
pytest
.
mark
.
asyncio
async
def
test_client_data_methods_use_timeouts
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Make the server _not_ reply with a model config
m
.
setattr
(
dummy_server
,
"get_config"
,
lambda
x
:
None
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
# And ensure the task completes anyway
# (client.setup() invokes server.get_config())
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
setup
())
with
pytest
.
raises
(
TimeoutError
,
match
=
"Server didn't reply within"
):
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_aborts_use_timeouts
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Hang all abort requests
m
.
setattr
(
dummy_server
,
"abort"
,
lambda
x
:
None
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
# The client should suppress timeouts on `abort`s
# and return normally, assuming the server will eventually
# abort the request.
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
abort
(
"test request id"
))
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_data_methods_reraise_exceptions
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Make the server raise some random exception
exception
=
RuntimeError
(
"Client test exception"
)
def
raiser
():
raise
exception
m
.
setattr
(
dummy_server
.
engine
,
"get_model_config"
,
raiser
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
setup
())
# And ensure the task completes, raising the exception
with
pytest
.
raises
(
RuntimeError
,
match
=
str
(
exception
)):
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_errors_after_closing
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
client
.
close
()
# Healthchecks and generate requests will fail with explicit errors
with
pytest
.
raises
(
RPCClientClosedError
):
await
client
.
check_health
()
with
pytest
.
raises
(
RPCClientClosedError
):
async
for
_
in
client
.
generate
(
None
,
None
,
None
):
pass
# But no-ops like aborting will pass
await
client
.
abort
(
"test-request-id"
)
await
client
.
do_log_stats
()
tests/entrypoints/openai/test_accuracy.py
0 → 100644
View file @
af7f4372
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import
lm_eval
import
pytest
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT
=
500
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--max-model-len"
,
"4096"
,
"--enable-chunked-prefill"
,
"--disable-log-requests"
,
"--enforce-eager"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
server_data
(
server
):
return
{
"url"
:
f
"
{
server
.
url_for
(
'v1'
)
}
/completions"
,
}
def
test_lm_eval_accuracy
(
server_data
):
model_args
=
(
f
"model=
{
MODEL_NAME
}
,"
f
"base_url=
{
server_data
[
'url'
]
}
,"
f
"num_concurrent=
{
NUM_CONCURRENT
}
,tokenized_requests=False"
)
results
=
lm_eval
.
simple_evaluate
(
model
=
"local-completions"
,
model_args
=
model_args
,
tasks
=
TASK
,
)
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
tests/entrypoints/openai/test_audio.py
0 → 100644
View file @
af7f4372
from
typing
import
Dict
,
List
import
openai
import
pytest
from
vllm.assets.audio
import
AudioAsset
from
vllm.multimodal.utils
import
encode_audio_base64
,
fetch_audio
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"fixie-ai/ultravox-v0_3"
TEST_AUDIO_URLS
=
[
AudioAsset
(
"winning_call"
).
url
,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"4096"
,
"--enforce-eager"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_audio
()
->
Dict
[
str
,
str
]:
return
{
audio_url
:
encode_audio_base64
(
*
fetch_audio
(
audio_url
))
for
audio_url
in
TEST_AUDIO_URLS
}
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_single_chat_session_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
audio_url
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
202
,
total_tokens
=
212
)
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_single_chat_session_audio_base64encoded
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
f
"data:audio/wav;base64,
{
base64_encoded_audio
[
audio_url
]
}
"
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
202
,
total_tokens
=
212
)
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_chat_streaming_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
audio_url
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
stop_reason
=
chat_completion
.
choices
[
0
].
finish_reason
# test streaming
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
)
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
role
:
assert
delta
.
role
==
"assistant"
if
delta
.
content
:
chunks
.
append
(
delta
.
content
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
stop_reason
assert
delta
.
content
assert
""
.
join
(
chunks
)
==
output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
audio_url
}
},
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
audio_url
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
# test multi-audio input
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
)
# the server should still work afterwards
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
completion
=
completion
.
choices
[
0
].
text
assert
completion
is
not
None
and
len
(
completion
)
>=
0
tests/entrypoints/openai/test_basic.py
View file @
af7f4372
...
...
@@ -50,12 +50,3 @@ async def test_check_health(client: openai.AsyncOpenAI):
response
=
requests
.
get
(
base_url
+
"/health"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
asyncio
async
def
test_log_metrics
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/metrics"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
tests/entrypoints/openai/test_chat.py
View file @
af7f4372
# imports for guided decoding tests
import
json
import
re
from
typing
import
List
from
typing
import
Dict
,
List
,
Optional
import
jsonschema
import
openai
# use the official client for correctness check
...
...
@@ -174,6 +174,88 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name, prompt_logprobs"
,
[(
MODEL_NAME
,
1
),
(
MODEL_NAME
,
0
),
(
MODEL_NAME
,
-
1
),
(
MODEL_NAME
,
None
)],
)
async
def
test_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
prompt_logprobs
:
Optional
[
int
]):
params
:
Dict
=
{
"messages"
:
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}],
"model"
:
model_name
}
if
prompt_logprobs
is
not
None
:
params
[
"extra_body"
]
=
{
"prompt_logprobs"
:
prompt_logprobs
}
if
prompt_logprobs
is
not
None
and
prompt_logprobs
<
0
:
with
pytest
.
raises
(
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
**
params
)
else
:
completion
=
await
client
.
chat
.
completions
.
create
(
**
params
)
if
prompt_logprobs
is
not
None
:
assert
completion
.
prompt_logprobs
is
not
None
assert
len
(
completion
.
prompt_logprobs
)
>
0
else
:
assert
completion
.
prompt_logprobs
is
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_more_than_one_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
params
:
Dict
=
{
"messages"
:
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"Who won the world series in 2020?"
},
{
"role"
:
"assistant"
,
"content"
:
"The Los Angeles Dodgers won the World Series in 2020."
},
{
"role"
:
"user"
,
"content"
:
"Where was it played?"
}],
"model"
:
model_name
,
"extra_body"
:
{
"prompt_logprobs"
:
1
}
}
completion_1
=
await
client
.
chat
.
completions
.
create
(
**
params
)
params
[
"extra_body"
]
=
{
"prompt_logprobs"
:
2
}
completion_2
=
await
client
.
chat
.
completions
.
create
(
**
params
)
assert
len
(
completion_1
.
prompt_logprobs
[
3
])
==
1
assert
len
(
completion_2
.
prompt_logprobs
[
3
])
==
2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
...
...
tests/entrypoints/openai/test_completion.py
View file @
af7f4372
...
...
@@ -3,7 +3,7 @@ import json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
from
typing
import
Dict
,
List
,
Optional
import
jsonschema
import
openai
# use the official client for correctness check
...
...
@@ -87,15 +87,13 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
default_server_args
):
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
""
,
"--disable-frontend-multiprocessing"
])
def
client
(
default_server_args
,
request
):
if
request
.
param
:
default_server_args
.
append
(
request
.
param
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
yield
remote_server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
...
...
@@ -132,6 +130,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
1
assert
completion
.
choices
[
0
].
prompt_logprobs
is
None
@
pytest
.
mark
.
asyncio
...
...
@@ -269,6 +268,37 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name, prompt_logprobs"
,
[(
MODEL_NAME
,
-
1
),
(
MODEL_NAME
,
0
),
(
MODEL_NAME
,
1
),
(
MODEL_NAME
,
None
)])
async
def
test_prompt_logprobs_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
prompt_logprobs
:
Optional
[
int
]):
params
:
Dict
=
{
"prompt"
:
[
"A robot may not injure another robot"
,
"My name is"
],
"model"
:
model_name
,
}
if
prompt_logprobs
is
not
None
:
params
[
"extra_body"
]
=
{
"prompt_logprobs"
:
prompt_logprobs
}
if
prompt_logprobs
is
not
None
and
prompt_logprobs
<
0
:
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
**
params
)
else
:
completion
=
await
client
.
completions
.
create
(
**
params
)
if
prompt_logprobs
is
not
None
:
assert
completion
.
choices
[
0
].
prompt_logprobs
is
not
None
assert
len
(
completion
.
choices
[
0
].
prompt_logprobs
)
>
0
assert
completion
.
choices
[
1
].
prompt_logprobs
is
not
None
assert
len
(
completion
.
choices
[
1
].
prompt_logprobs
)
>
0
else
:
assert
completion
.
choices
[
0
].
prompt_logprobs
is
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
...
...
tests/entrypoints/openai/test_disable_mp.py
deleted
100644 → 0
View file @
5e19cdef
"""
Repeat of tests in test_completion.py with the non-mp backend.
"""
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
PA_NAME
=
"swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS
=
8
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
tmp_dir
=
TemporaryDirectory
()
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
special_tokens
=
True
)
assert
added
==
3
tokenizer
.
save_pretrained
(
tmp_model_dir
)
yield
tmp_model_dir
tmp_dir
.
cleanup
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_pa_files
():
return
snapshot_download
(
repo_id
=
PA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
,
zephyr_pa_files
):
return
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--max-num-seqs"
,
"128"
,
"--enforce-eager"
,
# lora config
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
# pa config
"--enable-prompt-adapter"
,
"--prompt-adapters"
,
f
"zephyr-pa=
{
zephyr_pa_files
}
"
,
f
"zephyr-pa2=
{
zephyr_pa_files
}
"
,
"--max-prompt-adapters"
,
"2"
,
"--max-prompt-adapter-token"
,
"128"
,
"--disable-frontend-multiprocessing"
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
default_server_args
):
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras, then test prompt adapters
"model_name,num_virtual_tokens"
,
[(
MODEL_NAME
,
0
),
(
"zephyr-lora"
,
0
),
(
"zephyr-lora2"
,
0
),
(
"zephyr-pa"
,
PA_NUM_VIRTUAL_TOKENS
),
(
"zephyr-pa2"
,
PA_NUM_VIRTUAL_TOKENS
)],
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
num_virtual_tokens
:
int
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
+
num_virtual_tokens
,
total_tokens
=
11
+
num_virtual_tokens
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
1
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
"zephyr-lora2"
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
# Added tokens should appear in tokenized prompt
assert
completion
.
choices
[
0
].
text
.
startswith
(
"<unk><unk>vllm1vllm2vllm3"
)
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens_base_model
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
# Added tokens should not appear in tokenized prompt
assert
"vllm"
not
in
completion
.
choices
[
0
].
text
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras, then test prompt adapters
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
,
"zephyr-pa"
,
"zephyr-pa2"
],
)
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
None
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora and 1 pa hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
0
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
==
1
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
5
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
5
<=
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
21
,
)
...
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
30
,
stream
=
True
,
)
async
for
chunk
in
stream
:
...
# the server should still work afterwards
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is an LLM?"
single_completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
)
single_output
=
single_completion
.
choices
[
0
].
text
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
)
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
""
.
join
(
chunks
)
==
single_output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
,
"continuous_usage_stats"
:
False
,
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options=
# {"include_usage": False, "continuous_usage_stats": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
,
"continuous_usage_stats"
:
True
,
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
False
,
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=True, stream_options=
# {"include_usage": True, "continuous_usage_stats": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
,
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
not
None
assert
chunk
.
usage
.
prompt_tokens
>
0
assert
chunk
.
usage
.
completion_tokens
>
0
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options=
# {"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options=
# {"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
# Test stream=False, stream_options=
# {"continuous_usage_stats": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"continuous_usage_stats"
:
None
})
# Test stream=False, stream_options=
# {"continuous_usage_stats": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"continuous_usage_stats"
:
True
})
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Check batched prompts in plain, n=2 beam-search and streaming modes.

    Each batch holds two copies of the same prompt, so both copies are
    expected to yield the same text.
    """
    text_prompts = ["Hello, my name is"] * 2
    token_id_prompts = [[0, 0, 0, 0, 0]] * 2

    # Exercise both the text form and the token-ID form of the prompt.
    for prompts in (text_prompts, token_id_prompts):
        # Plain list of prompts: one choice per prompt.
        plain = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain.choices) == 2
        first_plain, second_plain = plain.choices
        assert first_plain.text == second_plain.text

        # n = 2: two choices per prompt, four in total.
        # NOTE: use_beam_search has to be true for n > 1 in vLLM, but is not
        # necessary for the official client.
        beam = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body={"use_beam_search": True},
        )
        assert len(beam.choices) == 4
        beam_texts = [
            beam.choices[0].text,
            beam.choices[1].text,
            beam.choices[2].text,
            beam.choices[3].text,
        ]
        assert beam_texts[0] != beam_texts[1], (
            "beam search should be different")
        assert beam_texts[0] == beam_texts[2], (
            "two copies of the same prompt should be the same")
        assert beam_texts[1] == beam_texts[3], (
            "two copies of the same prompt should be the same")

        # Streaming: every chunk carries one choice; rebuild the text of
        # each prompt from its chunks using the choice index.
        stream = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        streamed = ["", ""]
        async for chunk in stream:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            streamed[piece.index] += piece.text
        assert streamed[0] == streamed[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can force a token and can ban tokens."""
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Token IDs of ``text`` without any special tokens added.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # --- Exclusive selection: a +100 bias on a single token ID. ---
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = encode(forced_text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    # NOTE: zip() stops at the shorter sequence, so only the common prefix
    # of the two token lists is compared here.
    assert all(got == want
               for got, want in zip(response_tokens, expected_tokens))

    # --- Ban: a -100 bias on every token of an unbiased response. ---
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = unbiased.choices[0].text
    banned_bias = {str(token): -100 for token in encode(first_response)}
    biased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=banned_bias,
    )
    assert first_response != biased.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    """Check that the single generated token is one of ``allowed_token_ids``."""
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    prompt = "Hello, my name is"
    max_tokens = 1

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        seed=42,
        extra_body={"allowed_token_ids": allowed_ids},
        logprobs=1,
    )

    # The generated tokens are read from the logprobs payload and mapped
    # back to IDs with the tokenizer.
    response_tokens = completion.choices[0].logprobs.tokens
    assert len(response_tokens) == 1
    generated_ids = tokenizer.convert_tokens_to_ids(response_tokens)
    assert generated_ids[0] in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Check that ``guided_json`` output parses and validates against the schema."""
    prompt = (f"Give an example JSON for an employee profile "
              f"that fits this schema: {sample_json_schema}")
    guided_options = {
        "guided_json": sample_json_schema,
        "guided_decoding_backend": guided_decoding_backend,
    }
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options,
    )

    assert completion.id is not None
    assert len(completion.choices) == 3
    # Each of the three samples must be valid JSON that satisfies the schema.
    for choice in completion.choices:
        output_json = json.loads(choice.text)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Check that ``guided_regex`` output fully matches the given regex."""
    guided_options = {
        "guided_regex": sample_regex,
        "guided_decoding_backend": guided_decoding_backend,
    }
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options,
    )

    assert completion.id is not None
    assert len(completion.choices) == 3
    # The whole text of every sample must match, not just a prefix.
    for choice in completion.choices:
        assert re.fullmatch(sample_regex, choice.text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Check that ``guided_choice`` output is one of the offered choices."""
    guided_options = {
        "guided_choice": sample_guided_choice,
        "guided_decoding_backend": guided_decoding_backend,
    }
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options,
    )

    assert completion.id is not None
    assert len(completion.choices) == 2
    for choice in completion.choices:
        assert choice.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Check that ``guided_grammar`` output parses and equals the expected SQL."""
    sql_prompt = ("Generate a sql state that select col_1 from "
                  "table_1 where it is equals to 1")
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=sql_prompt,
        temperature=1.0,
        max_tokens=500,
        extra_body={"guided_grammar": sample_sql_statements},
    )
    content = completion.choices[0].text

    # Parse the output with Lark; parse() raises if the text is not a valid
    # sentence of the grammar.
    from lark import Lark
    Lark(sample_sql_statements).parse(content)

    # Spaces are removed for the comparison because they were removed in
    # the grammar.
    ground_truth = "".join(
        "SELECT col_1 from table_1 where col_1 = 1".split(" "))

    assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check ``echo=True`` returns the prompt plus logprobs for all tokens.

    The echoed text must begin with the prompt, and the logprob arrays must
    cover more than the 5 generated tokens, with ``None`` in the first slot.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        # Token-ID prompts are decoded so they can be compared as text.
        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        # Plain prefix check: the prompt is literal text, so it must not be
        # interpreted as a regular-expression pattern.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # Every later position reports between max(logprobs_arg, 1) and
        # logprobs_arg + 1 entries.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Check that invalid guided-decoding options raise ``BadRequestError``."""
    # Case 1: guided_json is given the integer 42.
    wrong_type_options = {
        "guided_json": 42,
        "guided_decoding_backend": guided_decoding_backend,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options,
        )

    # Case 2: guided_regex and guided_json are supplied together.
    conflicting_options = {
        "guided_regex": sample_regex,
        "guided_json": sample_json_schema,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options,
        )
tests/entrypoints/openai/test_encoder_decoder.py
0 → 100644
View file @
af7f4372
import
openai
import
pytest
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"facebook/bart-base"
@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it."""
    cli_flags = ["--dtype", "bfloat16", "--enforce-eager"]
    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server
@pytest.fixture(scope="module")
def client(server):
    """Return an async client obtained from the ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Check one text completion and one token-ID completion."""
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    assert text_completion.id is not None
    choices = text_completion.choices
    assert choices is not None and len(choices) == 1

    only_choice = choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=2,
                                                  total_tokens=7)
    assert text_completion.usage == expected_usage

    # Same request with the prompt given as token IDs.
    token_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_completion.choices[0].text) >= 1
tests/entrypoints/openai/test_metrics.py
0 → 100644
View file @
af7f4372
from
http
import
HTTPStatus
import
openai
import
pytest
import
requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the baseline list of server CLI flags for this module."""
    # use half precision for speed and memory savings in CI environment
    precision_flags = ["--dtype", "bfloat16"]
    remaining_flags = [
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
    return precision_flags + remaining_flags
@pytest.fixture(scope="module",
                params=[
                    "",
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
def client(default_server_args, request):
    """Yield an async client for a server started with one extra flag.

    The fixture is parametrized over the optional extra flag; the empty
    string means "no extra flag".
    """
    # Work on a copy: ``default_server_args`` is module-scoped, so the same
    # list object is passed to every parametrization. Appending to it in
    # place would carry flags from earlier parametrizations into later
    # server launches.
    server_args = list(default_server_args)
    if request.param:
        server_args.append(request.param)

    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server.get_async_client()
_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
    # One fewer sample than generated tokens is expected per request.
    "vllm:time_per_output_token_seconds":
    [("_count",
      _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:request_prompt_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_generation_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
    "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
    "vllm:prompt_tokens":
    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
    # The generation counter is derived from the generation length, not the
    # prompt length, so the expectation stays correct if the two differ.
    "vllm:generation_tokens":
    [("_total", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
    "vllm:request_success": [("_total", _NUM_REQUESTS)],
}
@pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI):
    """Send ``_NUM_REQUESTS`` completions and check the metric values.

    Every family listed in ``EXPECTED_VALUES`` must be present in the
    ``/metrics`` output, and each listed suffix sample must have the
    expected value.
    """
    # Drop the last three characters of the client base URL, then any
    # surrounding slashes (presumably removes a trailing "v1/" - confirm).
    base_url = str(client.base_url)[:-3].strip("/")

    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=MODEL_NAME,
            prompt=_TOKENIZED_PROMPT,
            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)

    response = requests.get(base_url + "/metrics")
    print(response.text)
    assert response.status_code == HTTPStatus.OK

    # Loop over all expected metric families.
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
        # First family in the prom endpoint output with a matching name.
        family = next(
            (candidate
             for candidate in text_string_to_metric_families(response.text)
             if candidate.name == metric_family),
            None,
        )
        assert family is not None, (
            f"Did not find {metric_family} in prom endpoint")

        # Check that each suffix is found in the prom endpoint.
        for suffix, expected_value in suffix_values_list:
            metric_name_w_suffix = f"{metric_family}{suffix}"
            sample = next(
                (candidate for candidate in family.samples
                 if candidate.name == metric_name_w_suffix),
                None,
            )
            assert sample is not None, (
                f"Did not find {metric_name_w_suffix} in prom endpoint")

            # For each suffix, make sure the value matches what we expect.
            assert sample.value == expected_value, (
                f"{metric_name_w_suffix} expected value of "
                f"{expected_value} did not match found value "
                f"{sample.value}")
EXPECTED_METRICS
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_swapped"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:cpu_cache_usage_perc"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_count"
,
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_count"
,
"vllm:request_params_n_sum"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_count"
,
"vllm:request_params_best_of_sum"
,
"vllm:request_params_best_of_bucket"
,
"vllm:request_params_best_of_count"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:request_success_total"
,
"vllm:cache_config_info"
,
# labels in cache_config_info
"block_size"
,
"cache_dtype"
,
"cpu_offload_gb"
,
"enable_prefix_caching"
,
"gpu_memory_utilization"
,
"num_cpu_blocks"
,
"num_gpu_blocks"
,
"num_gpu_blocks_override"
,
"sliding_window"
,
"swap_space_bytes"
,
]
@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
    """Check that every name in ``EXPECTED_METRICS`` appears in ``/metrics``."""
    # Drop the last three characters of the client base URL, then any
    # surrounding slashes (presumably removes a trailing "v1/" - confirm).
    base_url = str(client.base_url)[:-3].strip("/")
    metrics_url = base_url + "/metrics"

    # sending a request triggers the metrics to be logged.
    await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    response = requests.get(metrics_url)
    assert response.status_code == HTTPStatus.OK

    # Plain substring check against the raw text of the endpoint.
    for metric in EXPECTED_METRICS:
        assert metric in response.text
tests/entrypoints/openai/test_mp_api_server.py
0 → 100644
View file @
af7f4372
import
time
import
pytest
from
vllm.entrypoints.openai.api_server
import
build_async_engine_client
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.utils
import
FlexibleArgumentParser
@pytest.mark.asyncio
async def test_mp_crash_detection():
    """Check that building the engine client with bad args returns in <60s."""
    arg_parser = make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
    args = arg_parser.parse_args([])
    # use an invalid tensor_parallel_size to trigger the
    # error in the server
    args.tensor_parallel_size = 65536

    started_at = time.perf_counter()
    async with build_async_engine_client(args):
        pass
    elapsed = time.perf_counter() - started_at

    assert elapsed < 60, ("Expected vLLM to gracefully shutdown in <60s "
                          "if there is an error in the startup.")
@pytest.mark.asyncio
async def test_mp_cuda_init():
    """Check that the engine client builds after CUDA was already initialized."""
    # it should not crash, when cuda is initialized
    # in the API server process
    import torch
    torch.cuda.init()

    arg_parser = make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
    default_args = arg_parser.parse_args([])

    async with build_async_engine_client(default_args):
        pass
tests/entrypoints/openai/test_oot_registration.py
View file @
af7f4372
import
sys
import
time
import
torch
from
openai
import
OpenAI
,
OpenAIError
from
vllm
import
ModelRegistry
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.utils
import
get_open_port
class MyOPTForCausalLM(OPTForCausalLM):
    """OPT variant whose logits always favour token index 0."""

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Return the parent's logits overwritten in place: 1.0 in column 0, 0 elsewhere."""
        # this dummy model always predicts the first token
        dummy_logits = super().compute_logits(hidden_states,
                                              sampling_metadata)
        dummy_logits.zero_()
        # In-place add on the column view updates ``dummy_logits`` itself.
        dummy_logits[:, 0].add_(1.0)
        return dummy_logits
def server_function(port):
    """Register the dummy model and run the OpenAI API server on ``port``.

    ``sys.argv`` is replaced before the API server module is executed as
    ``__main__``.
    """
    # register our dummy model
    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)

    cli_line = ("--model facebook/opt-125m --gpu-memory-utilization 0.10 "
                f"--dtype float32 --api-key token-abc123 --port {port}")
    sys.argv = ["placeholder.py", *cli_line.split()]

    import runpy
    runpy.run_module('vllm.entrypoints.openai.api_server',
                     run_name='__main__')
def test_oot_registration_for_api_server():
    """Run the API server in a child process and check the dummy model's output.

    The chat request is retried while the client reports a connection error,
    for at most ``MAX_SERVER_START_WAIT_S`` seconds. After removing "<s>",
    the generated text must be empty.
    """
    port = get_open_port()
    ctx = torch.multiprocessing.get_context()
    server = ctx.Process(target=server_function, args=(port, ))
    server.start()
    # Kill the server process on every exit path, including a start-up
    # timeout or an unexpected client error, so it is never left running.
    try:
        MAX_SERVER_START_WAIT_S = 60
        client = OpenAI(
            base_url=f"http://localhost:{port}/v1",
            api_key="token-abc123",
        )
        now = time.time()
        while True:
            try:
                completion = client.chat.completions.create(
                    model="facebook/opt-125m",
                    messages=[{
                        "role": "system",
                        "content": "You are a helpful assistant."
                    }, {
                        "role": "user",
                        "content": "Hello!"
                    }],
                    temperature=0,
                )
                break
            except OpenAIError as e:
                if "Connection error" in str(e):
                    # Connection errors are retried until the deadline.
                    time.sleep(3)
                    if time.time() - now > MAX_SERVER_START_WAIT_S:
                        raise RuntimeError(
                            "Server did not start in time") from e
                else:
                    # Any other client error is a genuine failure.
                    raise
    finally:
        server.kill()

    generated_text = completion.choices[0].message.content
    # make sure only the first token is generated
    rest = generated_text.replace("<s>", "")
    assert rest == ""
from
...utils
import
VLLM_PATH
,
RemoteOpenAIServer
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
def run_and_test_dummy_opt_api_server(model, tp=1):
    """Serve ``model`` with ``-tp`` set to ``tp`` and check its chat output.

    After removing "<s>", the generated text must be empty.
    """
    # the model is registered through the plugin
    server_args = ["--gpu-memory-utilization", "0.10"]
    server_args += ["--dtype", "float32"]
    server_args += ["--chat-template", str(chatml_jinja_path)]
    server_args += ["--load-format", "dummy"]
    server_args += ["-tp", f"{tp}"]

    chat_messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": "Hello!"
        },
    ]

    with RemoteOpenAIServer(model, server_args) as server:
        completion = server.get_client().chat.completions.create(
            model=model,
            messages=chat_messages,
            temperature=0,
        )
        generated_text = completion.choices[0].message.content
        assert generated_text is not None
        # make sure only the first token is generated
        leftover = generated_text.replace("<s>", "")
        assert leftover == ""
def test_oot_registration_for_api_server(dummy_opt_path: str):
    """Run the dummy-OPT API server check for the model at ``dummy_opt_path``."""
    run_and_test_dummy_opt_api_server(model=dummy_opt_path)
tests/entrypoints/openai/test_prompt_validation.py
0 → 100644
View file @
af7f4372
# imports for guided decoding tests
import
re
import
openai
import
pytest
from
...utils
import
RemoteOpenAIServer
@pytest.mark.asyncio
async def test_empty_prompt():
    """Check that an empty prompt is rejected with ``BadRequestError``."""
    model_name = "gpt2"
    expected_error = re.compile('.+Prompt cannot be empty.+')

    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        client = remote_server.get_async_client()

        # The error text must match the "Prompt cannot be empty" pattern.
        with pytest.raises(openai.BadRequestError, match=expected_error):
            await client.completions.create(
                model=model_name,
                prompt="",
                max_tokens=5,
                temperature=0.0,
            )
tests/entrypoints/openai/test_run_batch.py
View file @
af7f4372
...
...
@@ -7,13 +7,39 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501
INPUT_BATCH
=
"""{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH
=
"""{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INPUT_EMBEDDING_BATCH
=
"""{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
def
test_e2e
():
def test_empty_file():
    """Run ``run_batch`` on an empty input file and expect empty output."""
    with tempfile.NamedTemporaryFile("w") as input_file, \
            tempfile.NamedTemporaryFile("r") as output_file:
        input_file.write("")
        input_file.flush()

        command = [
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/e5-mistral-7b-instruct",
        ]
        proc = subprocess.Popen(command)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        # Nothing but whitespace may have been written to the output file.
        written = output_file.read()
        assert written.strip() == ""
def
test_completions
():
with
tempfile
.
NamedTemporaryFile
(
"w"
)
as
input_file
,
tempfile
.
NamedTemporaryFile
(
"r"
)
as
output_file
:
...
...
@@ -35,7 +61,7 @@ def test_e2e():
BatchRequestOutput
.
model_validate_json
(
line
)
def
test_
e2e
_invalid_input
():
def
test_
completions
_invalid_input
():
"""
Ensure that we fail when the input doesn't conform to the openai api.
"""
...
...
@@ -52,3 +78,25 @@ def test_e2e_invalid_input():
proc
.
communicate
()
proc
.
wait
()
assert
proc
.
returncode
!=
0
,
f
"
{
proc
=
}
"
def test_embeddings():
    """Run ``run_batch`` on the embedding batch and validate each output line."""
    with tempfile.NamedTemporaryFile("w") as input_file, \
            tempfile.NamedTemporaryFile("r") as output_file:
        input_file.write(INPUT_EMBEDDING_BATCH)
        input_file.flush()

        command = [
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/e5-mistral-7b-instruct",
        ]
        proc = subprocess.Popen(command)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        output_lines = output_file.read().strip().split("\n")
        for output_line in output_lines:
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(output_line)
tests/entrypoints/openai/test_serving_chat.py
View file @
af7f4372
...
...
@@ -73,7 +73,6 @@ def test_serving_chat_should_set_correct_max_tokens():
with
suppress
(
Exception
):
asyncio
.
run
(
serving_chat
.
create_chat_completion
(
req
))
# AsyncLLMEngine.generate(inputs, sampling_params, ...)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
93
req
.
max_tokens
=
10
...
...
tests/entrypoints/openai/test_shutdown.py
0 → 100644
View file @
af7f4372
import
json
import
os
import
openai
import
pytest
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path):
    """Check that the server process exits after a request crashes the engine."""
    # Use a bad adapter to crash the engine
    # (This test will fail when that bug is fixed)
    adapter_path = tmp_path / "bad_adapter"
    os.mkdir(adapter_path)
    config_file = adapter_path / "adapter_model_config.json"
    weights_file = adapter_path / "adapter_model.safetensors"
    with open(config_file, "w") as config_handle:
        json.dump({"not": "real"}, config_handle)
    with open(weights_file, "wb") as weights_handle:
        weights_handle.write(b"this is fake")

    # dtype, max-len etc set so that this can run in CI
    args = ["--dtype", "bfloat16"]
    args += ["--max-model-len", "8192"]
    args += ["--enforce-eager"]
    args += ["--max-num-seqs", "128"]
    args += ["--enable-lora"]
    args += ["--lora-modules", f"bad-adapter={tmp_path / 'bad_adapter'}"]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        client = remote_server.get_async_client()

        with pytest.raises(openai.APIConnectionError):
            # This crashes the engine
            await client.completions.create(
                model="bad-adapter",
                prompt="Hello, my name is",
            )

        # Now the server should shut down
        exit_status = remote_server.proc.wait(timeout=1)
        assert exit_status is not None
tests/kernels/quant_utils.py
View file @
af7f4372
...
...
@@ -2,6 +2,13 @@ from typing import Optional, Tuple, Union
import
torch
from
vllm.utils
import
is_hip
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8_MAX
=
224.0
FP8_DTYPE
=
torch
.
float8_e4m3fnuz
if
is_hip
()
else
torch
.
float8_e4m3fn
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
    """Convert ``x`` to a float32 tensor on the ``cuda`` device."""
    conversion = {"dtype": torch.float32, "device": 'cuda'}
    return torch.as_tensor(x, **conversion)
...
...
@@ -11,13 +18,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
scale_ub
:
Optional
[
torch
.
tensor
]
=
None
)
\
->
Tuple
[
torch
.
tensor
,
torch
.
tensor
]:
assert
quant_dtype
in
[
torch
.
int8
,
torch
.
float8_e4m3fn
]
assert
quant_dtype
in
[
torch
.
int8
,
FP8_DTYPE
]
if
scale_ub
is
not
None
:
assert
quant_dtype
==
torch
.
float8_e4m3fn
assert
quant_dtype
==
FP8_DTYPE
qtype_traits
=
torch
.
iinfo
(
quant_dtype
)
if
quant_dtype
==
torch
.
int8
\
else
torch
.
finfo
(
quant_dtype
)
qtype_max
=
as_float32_tensor
(
qtype_traits
.
max
)
qtype_traits_max
=
ROCM_FP8_MAX
if
is_hip
()
else
qtype_traits
.
max
qtype_traits_min
=
-
ROCM_FP8_MAX
if
is_hip
()
else
qtype_traits
.
min
qtype_max
=
as_float32_tensor
(
qtype_traits_max
)
s_1
=
as_float32_tensor
(
1.0
)
s_512
=
as_float32_tensor
(
512.0
)
...
...
@@ -37,15 +46,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
iscales
=
as_float32_tensor
(
s_1
/
scales
)
torch_out
=
as_float32_tensor
(
x
)
*
iscales
torch_out
=
torch_out
.
round
()
torch_out
=
torch_out
.
clamp
(
qtype_traits
.
min
,
qtype_traits
.
max
).
to
(
quant_dtype
)
torch_out
=
torch_out
.
clamp
(
qtype_traits
_
min
,
qtype_traits
_
max
).
to
(
quant_dtype
)
else
:
assert
quant_dtype
==
torch
.
float8_e4m3fn
assert
quant_dtype
==
FP8_DTYPE
min_scaling_factor
=
s_1
/
(
qtype_max
*
s_512
)
scales
=
scales
.
clamp
(
min
=
min_scaling_factor
)
torch_out
=
as_float32_tensor
(
x
)
/
scales
torch_out
=
torch_out
.
clamp
(
qtype_traits
.
min
,
qtype_traits
.
max
).
to
(
quant_dtype
)
torch_out
=
torch_out
.
clamp
(
qtype_traits
_
min
,
qtype_traits
_
max
).
to
(
quant_dtype
)
return
torch_out
,
scales
...
...
@@ -56,8 +65,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
def
ref_dynamic_per_tensor_fp8_quant
(
x
:
torch
.
tensor
)
\
->
Tuple
[
torch
.
tensor
,
torch
.
tensor
]:
fp8_traits
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
fp8_max
=
as_float32_tensor
(
fp8_traits
.
max
)
fp8_traits
=
torch
.
finfo
(
FP8_DTYPE
)
fp8_traits_max
=
ROCM_FP8_MAX
if
is_hip
()
else
fp8_traits
.
max
fp8_traits_min
=
-
ROCM_FP8_MAX
if
is_hip
()
else
fp8_traits
.
min
fp8_max
=
as_float32_tensor
(
fp8_traits_max
)
one
=
as_float32_tensor
(
1.0
)
# For fp8, in order to match the cuda kernel output, we have to do exactly
...
...
@@ -68,5 +79,5 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
ref_scale
=
x_max
/
fp8_max
ref_iscale
=
one
/
ref_scale
ref_out
=
(
as_float32_tensor
(
x
)
*
ref_iscale
).
clamp
(
fp8_traits
.
min
,
fp8_traits
.
max
).
to
(
dtype
=
torch
.
float8_e4m3fn
)
return
ref_out
,
ref_scale
fp8_traits
_
min
,
fp8_traits
_
max
).
to
(
FP8_DTYPE
)
return
ref_out
,
ref_scale
.
view
((
1
,
))
tests/kernels/test_activation.py
View file @
af7f4372
...
...
@@ -47,7 +47,7 @@ def test_act_and_mul(
ref_out
=
layer
.
forward_native
(
x
)
# The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison.
assert
torch
.
all
close
(
out
,
ref_out
,
atol
=
0.0
,
rtol
=
0.0
)
torch
.
testing
.
assert_
close
(
out
,
ref_out
,
atol
=
0.0
,
rtol
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"activation"
,
[
FastGELU
,
NewGELU
])
...
...
@@ -73,7 +73,7 @@ def test_activation(
layer
=
activation
()
out
=
layer
(
x
)
ref_out
=
layer
.
forward_native
(
x
)
assert
torch
.
all
close
(
out
,
ref_out
,
atol
=
get_default_atol
(
out
),
rtol
=
get_default_rtol
(
out
))
torch
.
testing
.
assert_
close
(
out
,
ref_out
,
atol
=
get_default_atol
(
out
),
rtol
=
get_default_rtol
(
out
))
tests/kernels/test_attention.py
View file @
af7f4372
...
...
@@ -277,7 +277,7 @@ def test_paged_attention(
atol
,
rtol
=
1e-3
,
1e-5
if
kv_cache_dtype
==
"fp8"
:
atol
,
rtol
=
1e-2
,
1e-5
assert
torch
.
all
close
(
output
,
ref_output
,
atol
=
atol
,
rtol
=
rtol
)
torch
.
testing
.
assert_
close
(
output
,
ref_output
,
atol
=
atol
,
rtol
=
rtol
)
def
ref_multi_query_kv_attention
(
...
...
@@ -382,4 +382,4 @@ def test_multi_query_kv_attention(
)
atol
=
get_default_atol
(
output
)
if
is_hip
()
else
1e-3
rtol
=
get_default_rtol
(
output
)
if
is_hip
()
else
1e-5
assert
torch
.
all
close
(
output
,
ref_output
,
atol
=
atol
,
rtol
=
rtol
)
torch
.
testing
.
assert_
close
(
output
,
ref_output
,
atol
=
atol
,
rtol
=
rtol
)
tests/kernels/test_attention_selector.py
View file @
af7f4372
...
...
@@ -3,9 +3,9 @@ from unittest.mock import patch
import
pytest
import
torch
from
tests.kernels.utils
import
(
STR_FLASH_ATTN_VAL
,
STR_INVALID_VAL
,
override_backend_env_variable
)
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm.attention.selector
import
which_attn_to_use
from
vllm.utils
import
STR_FLASH_ATTN_VAL
,
STR_INVALID_VAL
@
pytest
.
mark
.
parametrize
(
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment