Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0640f227
Commit
0640f227
authored
Sep 09, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.0' into v0.6.0-dev
parents
82f1ffdf
32e7db25
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
994 additions
and
173 deletions
+994
-173
tests/core/utils.py
tests/core/utils.py
+1
-1
tests/data/test_config.yaml
tests/data/test_config.yaml
+2
-0
tests/engine/test_stop_strings.py
tests/engine/test_stop_strings.py
+103
-52
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+34
-0
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+48
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+5
-3
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+5
-3
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+38
-3
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+9
-2
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+15
-5
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+5
-3
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+58
-2
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+5
-3
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+51
-47
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+2
-0
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+9
-8
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+5
-3
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+41
-38
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+389
-0
tests/kernels/test_awq_triton.py
tests/kernels/test_awq_triton.py
+169
-0
No files found.
tests/core/utils.py
View file @
0640f227
...
@@ -199,7 +199,7 @@ def append_new_token(out, token_id: int):
...
@@ -199,7 +199,7 @@ def append_new_token(out, token_id: int):
def
schedule_and_update_computed_tokens
(
scheduler
):
def
schedule_and_update_computed_tokens
(
scheduler
):
metas
,
out
=
scheduler
.
schedule
()
metas
,
out
,
_
=
scheduler
.
schedule
()
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
return
metas
,
out
return
metas
,
out
...
...
tests/data/test_config.yaml
0 → 100644
View file @
0640f227
port
:
12312
tensor_parallel_size
:
2
tests/engine/test_stop_strings.py
View file @
0640f227
...
@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams
...
@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams
MODEL
=
"meta-llama/llama-2-7b-hf"
MODEL
=
"meta-llama/llama-2-7b-hf"
MAX_TOKENS
=
200
MAX_TOKENS
=
200
IS_ASYNC
=
False
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_model
(
vllm_runner
):
def
vllm_model
(
vllm_runner
):
...
@@ -14,99 +16,148 @@ def vllm_model(vllm_runner):
...
@@ -14,99 +16,148 @@ def vllm_model(vllm_runner):
yield
vllm_model
yield
vllm_model
@
pytest
.
mark
.
skip_global_cleanup
def
_test_stopping
(
llm_engine
:
LLMEngine
,
def
test_stop_basic
(
vllm_model
):
expected_output
:
str
,
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
expected_reason
:
Any
,
stop
:
Optional
[
List
[
str
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
include_in_output
:
bool
=
False
,
use_async_output_proc
:
bool
=
False
)
->
None
:
llm_engine
.
add_request
(
"id"
,
"A story about vLLM:
\n
"
,
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
MAX_TOKENS
,
stop
=
stop
,
stop_token_ids
=
stop_token_ids
,
include_stop_str_in_output
=
include_in_output
,
),
None
)
output
:
Optional
[
CompletionOutput
]
=
None
output_text
=
""
stop_reason
=
None
if
use_async_output_proc
:
llm_engine
.
step
()
while
llm_engine
.
has_unfinished_requests
():
(
request_output
,
)
=
llm_engine
.
step
()
(
output
,
)
=
request_output
.
outputs
# Ensure we don't backtrack
assert
output
.
text
.
startswith
(
output_text
)
output_text
=
output
.
text
stop_reason
=
output
.
stop_reason
assert
output
is
not
None
assert
output_text
==
expected_output
assert
stop_reason
==
expected_reason
def
_set_async_mode
(
llm_engine
,
is_async
):
llm_engine
.
scheduler
[
0
].
use_async_output_proc
=
is_async
def
_stop_basic
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
stop
=
[
"."
],
include_in_output
=
False
,
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
"."
)
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
stop
=
[
"."
],
include_in_output
=
True
,
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization."
,
expected_output
=
"VLLM is a 100% volunteer organization."
,
expected_reason
=
"."
)
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
_stop_multi_tokens
(
llm_engine
,
is_async
):
def
test_stop_multi_tokens
(
vllm_model
):
_test_stopping
(
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
False
,
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a "
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a "
,
expected_reason
=
"group of peo"
)
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
True
,
include_in_output
=
True
,
expected_output
=
expected_output
=
"VLLM is a 100% volunteer organization. We are a group of peo"
,
"VLLM is a 100% volunteer organization. We are a group of peo"
,
expected_reason
=
"group of peo"
)
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
_stop_partial_token
(
llm_engine
,
is_async
):
def
test_stop_partial_token
(
vllm_model
):
_test_stopping
(
llm_engine
,
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
stop
=
[
"gani"
],
stop
=
[
"gani"
],
include_in_output
=
False
,
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer or"
,
expected_output
=
"VLLM is a 100% volunteer or"
,
expected_reason
=
"gani"
)
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop
=
[
"gani"
],
stop
=
[
"gani"
],
include_in_output
=
True
,
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organi"
,
expected_output
=
"VLLM is a 100% volunteer organi"
,
expected_reason
=
"gani"
)
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
_stop_token_id
(
llm_engine
,
is_async
):
def
test_stop_token_id
(
vllm_model
):
# token id 13013 => " organization"
# token id 13013 => " organization"
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
stop_token_ids
=
[
13013
],
include_in_output
=
False
,
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer"
,
expected_output
=
"VLLM is a 100% volunteer"
,
expected_reason
=
13013
)
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
stop_token_ids
=
[
13013
],
include_in_output
=
True
,
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
13013
)
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
def
_test_stopping
(
llm_engine
:
LLMEngine
,
@
pytest
.
mark
.
skip_global_cleanup
expected_output
:
str
,
def
test_stop_basic
(
vllm_model
):
expected_reason
:
Any
,
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
stop
:
Optional
[
List
[
str
]]
=
None
,
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
include_in_output
:
bool
=
False
)
->
None
:
llm_engine
.
add_request
(
"id"
,
"A story about vLLM:
\n
"
,
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
MAX_TOKENS
,
stop
=
stop
,
stop_token_ids
=
stop_token_ids
,
include_stop_str_in_output
=
include_in_output
,
),
None
)
output
:
Optional
[
CompletionOutput
]
=
None
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
output_text
=
""
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
stop_reason
=
None
while
llm_engine
.
has_unfinished_requests
():
(
request_output
,
)
=
llm_engine
.
step
()
(
output
,
)
=
request_output
.
outputs
# Ensure we don't backtrack
assert
output
.
text
.
startswith
(
output_text
)
output_text
=
output
.
text
stop_reason
=
output
.
stop_reason
assert
output
is
not
None
@
pytest
.
mark
.
skip_global_cleanup
assert
output_text
==
expected_output
def
test_stop_multi_tokens
(
vllm_model
):
assert
stop_reason
==
expected_reason
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_partial_token
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_token_id
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
tests/entrypoints/llm/test_generate.py
View file @
0640f227
...
@@ -6,6 +6,7 @@ import pytest
...
@@ -6,6 +6,7 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
...conftest
import
cleanup
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
MODEL_NAME
=
"facebook/opt-125m"
MODEL_NAME
=
"facebook/opt-125m"
...
@@ -159,3 +160,36 @@ def test_chat():
...
@@ -159,3 +160,36 @@ def test_chat():
]
]
outputs
=
llm
.
chat
(
messages
)
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
assert
len
(
outputs
)
==
1
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_lazy_outlines.py
0 → 100644
View file @
0640f227
import
sys
from
vllm
import
LLM
,
SamplingParams
def
test_lazy_outlines
(
sample_regex
):
"""If users don't use guided decoding, outlines should not be imported.
"""
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# make sure outlines is not imported
assert
'outlines'
not
in
sys
.
modules
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# make sure outlines is not imported
assert
'outlines'
not
in
sys
.
modules
tests/entrypoints/openai/test_audio.py
View file @
0640f227
...
@@ -2,6 +2,7 @@ from typing import Dict, List
...
@@ -2,6 +2,7 @@ from typing import Dict, List
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.audio
import
AudioAsset
from
vllm.multimodal.utils
import
encode_audio_base64
,
fetch_audio
from
vllm.multimodal.utils
import
encode_audio_base64
,
fetch_audio
...
@@ -28,9 +29,10 @@ def server():
...
@@ -28,9 +29,10 @@ def server():
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
...
...
tests/entrypoints/openai/test_basic.py
View file @
0640f227
...
@@ -2,6 +2,7 @@ from http import HTTPStatus
...
@@ -2,6 +2,7 @@ from http import HTTPStatus
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
import
requests
import
requests
from
vllm.version
import
__version__
as
VLLM_VERSION
from
vllm.version
import
__version__
as
VLLM_VERSION
...
@@ -28,9 +29,10 @@ def server():
...
@@ -28,9 +29,10 @@ def server():
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_chat.py
View file @
0640f227
...
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional
...
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
torch
import
torch
from
openai
import
BadRequestError
from
openai
import
BadRequestError
...
@@ -46,9 +47,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
...
@@ -46,9 +47,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -837,6 +839,39 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
...
@@ -837,6 +839,39 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
assert
loaded
==
{
"result"
:
2
},
loaded
assert
loaded
==
{
"result"
:
2
},
loaded
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_schema
(
client
:
openai
.
AsyncOpenAI
):
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
(
'what is 1+1? please respond with a JSON object, '
'the format is {"result": 2}'
)
}],
response_format
=
{
"type"
:
"json_schema"
,
"json_schema"
:
{
"name"
:
"foo_test"
,
"schema"
:
{
"type"
:
"object"
,
"properties"
:
{
"result"
:
{
"type"
:
"integer"
},
},
},
}
})
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
is
not
None
loaded
=
json
.
loads
(
content
)
assert
loaded
==
{
"result"
:
2
},
loaded
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
...
...
tests/entrypoints/openai/test_completion.py
View file @
0640f227
...
@@ -8,6 +8,7 @@ from typing import Dict, List, Optional
...
@@ -8,6 +8,7 @@ from typing import Dict, List, Optional
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
...
@@ -89,11 +90,17 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
...
@@ -89,11 +90,17 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
@
pytest
.
fixture
(
scope
=
"module"
,
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
""
,
"--disable-frontend-multiprocessing"
])
params
=
[
""
,
"--disable-frontend-multiprocessing"
])
def
client
(
default_server_args
,
request
):
def
server
(
default_server_args
,
request
):
if
request
.
param
:
if
request
.
param
:
default_server_args
.
append
(
request
.
param
)
default_server_args
.
append
(
request
.
param
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
.
get_async_client
()
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_embedding.py
View file @
0640f227
...
@@ -3,6 +3,7 @@ import base64
...
@@ -3,6 +3,7 @@ import base64
import
numpy
as
np
import
numpy
as
np
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
...
@@ -24,10 +25,10 @@ def embedding_server():
...
@@ -24,10 +25,10 @@ def embedding_server():
yield
remote_server
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest
_
asyncio
.
fixture
@
pytest
.
fixture
(
scope
=
"module"
)
async
def
embedding_client
(
embedding_server
):
def
embedding_client
(
embedding_server
)
:
async
with
embedding_server
.
get_async_client
()
as
async_client
:
return
embedding_server
.
get_
async_client
()
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -128,9 +129,18 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -128,9 +129,18 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
for
data
in
responses_base64
.
data
:
for
data
in
responses_base64
.
data
:
decoded_responses_base64_data
.
append
(
decoded_responses_base64_data
.
append
(
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
dtype
=
"float"
).
tolist
())
dtype
=
"float
32
"
).
tolist
())
assert
responses_float
.
data
[
0
].
embedding
==
decoded_responses_base64_data
[
assert
responses_float
.
data
[
0
].
embedding
==
decoded_responses_base64_data
[
0
]
0
]
assert
responses_float
.
data
[
1
].
embedding
==
decoded_responses_base64_data
[
assert
responses_float
.
data
[
1
].
embedding
==
decoded_responses_base64_data
[
1
]
1
]
# Default response is float32 decoded from base64 by OpenAI Client
responses_default
=
await
embedding_client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
)
assert
responses_float
.
data
[
0
].
embedding
==
responses_default
.
data
[
0
].
embedding
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
1
].
embedding
tests/entrypoints/openai/test_encoder_decoder.py
View file @
0640f227
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
...
@@ -18,9 +19,10 @@ def server():
...
@@ -18,9 +19,10 @@ def server():
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_metrics.py
View file @
0640f227
import
subprocess
import
sys
import
tempfile
import
time
from
http
import
HTTPStatus
from
http
import
HTTPStatus
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
import
requests
import
requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -31,11 +36,17 @@ def default_server_args():
...
@@ -31,11 +36,17 @@ def default_server_args():
"--enable-chunked-prefill"
,
"--enable-chunked-prefill"
,
"--disable-frontend-multiprocessing"
,
"--disable-frontend-multiprocessing"
,
])
])
def
client
(
default_server_args
,
request
):
def
server
(
default_server_args
,
request
):
if
request
.
param
:
if
request
.
param
:
default_server_args
.
append
(
request
.
param
)
default_server_args
.
append
(
request
.
param
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
.
get_async_client
()
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
cl
:
yield
cl
_PROMPT
=
"Hello my name is Robert and I love magic"
_PROMPT
=
"Hello my name is Robert and I love magic"
...
@@ -177,3 +188,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
...
@@ -177,3 +188,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
for
metric
in
EXPECTED_METRICS
:
for
metric
in
EXPECTED_METRICS
:
assert
metric
in
response
.
text
assert
metric
in
response
.
text
def
test_metrics_exist_run_batch
():
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""
# noqa: E501
base_url
=
"0.0.0.0"
port
=
"8001"
server_url
=
f
"http://
{
base_url
}
:
{
port
}
"
with
tempfile
.
NamedTemporaryFile
(
"w"
)
as
input_file
,
tempfile
.
NamedTemporaryFile
(
"r"
)
as
output_file
:
input_file
.
write
(
input_batch
)
input_file
.
flush
()
proc
=
subprocess
.
Popen
([
sys
.
executable
,
"-m"
,
"vllm.entrypoints.openai.run_batch"
,
"-i"
,
input_file
.
name
,
"-o"
,
output_file
.
name
,
"--model"
,
"intfloat/e5-mistral-7b-instruct"
,
"--enable-metrics"
,
"--url"
,
base_url
,
"--port"
,
port
,
],
)
def
is_server_up
(
url
):
try
:
response
=
requests
.
get
(
url
)
return
response
.
status_code
==
200
except
requests
.
ConnectionError
:
return
False
while
not
is_server_up
(
server_url
):
time
.
sleep
(
1
)
response
=
requests
.
get
(
server_url
+
"/metrics"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
proc
.
wait
()
tests/entrypoints/openai/test_models.py
View file @
0640f227
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
@@ -43,9 +44,10 @@ def server(zephyr_lora_files):
...
@@ -43,9 +44,10 @@ def server(zephyr_lora_files):
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
View file @
0640f227
...
@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag(
...
@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_completion_return_tokens_as_token_ids_completion
(
async
def
test_completion_return_tokens_as_token_ids_completion
(
server_with_return_tokens_as_token_ids_flag
):
server_with_return_tokens_as_token_ids_flag
):
client
=
server_with_return_tokens_as_token_ids_flag
.
get_async_client
()
async
with
server_with_return_tokens_as_token_ids_flag
.
get_async_client
(
)
as
client
:
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
# Include Unicode characters to test for dividing a single
# Include Unicode characters to test for dividing a single
# character across multiple tokens: 🎉 is [28705, 31862] for the
# character across multiple tokens: 🎉 is [28705, 31862] for the
# Zephyr tokenizer
# Zephyr tokenizer
prompt
=
"Say 'Hello, world! 🎉'"
,
prompt
=
"Say 'Hello, world! 🎉'"
,
echo
=
True
,
echo
=
True
,
temperature
=
0
,
temperature
=
0
,
max_tokens
=
10
,
max_tokens
=
10
,
logprobs
=
1
)
logprobs
=
1
)
text
=
completion
.
choices
[
0
].
text
text
=
completion
.
choices
[
0
].
text
token_strs
=
completion
.
choices
[
0
].
logprobs
.
tokens
token_strs
=
completion
.
choices
[
0
].
logprobs
.
tokens
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# Check that the token representations are consistent between raw tokens
# Check that the token representations are consistent between raw
# and top_logprobs
# tokens and top_logprobs
# Slice off the first one, because there's no scoring associated with BOS
# Slice off the first one, because there's no scoring associated
top_logprobs
=
completion
.
choices
[
0
].
logprobs
.
top_logprobs
[
1
:]
# with BOS
top_logprob_keys
=
[
top_logprobs
=
completion
.
choices
[
0
].
logprobs
.
top_logprobs
[
1
:]
next
(
iter
(
logprob_by_tokens
))
for
logprob_by_tokens
in
top_logprobs
top_logprob_keys
=
[
]
next
(
iter
(
logprob_by_tokens
))
for
logprob_by_tokens
in
top_logprobs
assert
token_strs
[
1
:]
==
top_logprob_keys
]
assert
token_strs
[
1
:]
==
top_logprob_keys
# Check that decoding the tokens gives the expected text
# Check that decoding the tokens gives the expected text
tokens
=
[
int
(
token
.
removeprefix
(
"token_id:"
))
for
token
in
token_strs
]
tokens
=
[
int
(
token
.
removeprefix
(
"token_id:"
))
for
token
in
token_strs
]
assert
text
==
tokenizer
.
decode
(
tokens
,
skip_special_tokens
=
True
)
assert
text
==
tokenizer
.
decode
(
tokens
,
skip_special_tokens
=
True
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_chat_return_tokens_as_token_ids_completion
(
async
def
test_chat_return_tokens_as_token_ids_completion
(
server_with_return_tokens_as_token_ids_flag
):
server_with_return_tokens_as_token_ids_flag
):
client
=
server_with_return_tokens_as_token_ids_flag
.
get_async_client
()
async
with
server_with_return_tokens_as_token_ids_flag
.
get_async_client
(
response
=
await
client
.
chat
.
completions
.
create
(
)
as
client
:
model
=
MODEL_NAME
,
response
=
await
client
.
chat
.
completions
.
create
(
# Include Unicode characters to test for dividing a single
model
=
MODEL_NAME
,
# character across multiple tokens: 🎉 is [28705, 31862] for the
# Include Unicode characters to test for dividing a single
# Zephyr tokenizer
# character across multiple tokens: 🎉 is [28705, 31862] for the
messages
=
[{
# Zephyr tokenizer
"role"
:
"system"
,
messages
=
[{
"content"
:
"You like to respond in only emojis, like 🎉"
"role"
:
"system"
,
},
{
"content"
:
"You like to respond in only emojis, like 🎉"
"role"
:
"user"
,
},
{
"content"
:
"Please write some emojis: 🐱🐶🎉"
"role"
:
"user"
,
}],
"content"
:
"Please write some emojis: 🐱🐶🎉"
temperature
=
0
,
}],
max_tokens
=
8
,
temperature
=
0
,
logprobs
=
True
)
max_tokens
=
8
,
logprobs
=
True
)
text
=
response
.
choices
[
0
].
message
.
content
text
=
response
.
choices
[
0
].
message
.
content
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
token_ids
=
[]
token_ids
=
[]
for
logprob_content
in
response
.
choices
[
0
].
logprobs
.
content
:
for
logprob_content
in
response
.
choices
[
0
].
logprobs
.
content
:
token_ids
.
append
(
int
(
logprob_content
.
token
.
removeprefix
(
"token_id:"
)))
token_ids
.
append
(
assert
tokenizer
.
decode
(
token_ids
,
skip_special_tokens
=
True
)
==
text
int
(
logprob_content
.
token
.
removeprefix
(
"token_id:"
)))
assert
tokenizer
.
decode
(
token_ids
,
skip_special_tokens
=
True
)
==
text
tests/entrypoints/openai/test_serving_chat.py
View file @
0640f227
...
@@ -3,6 +3,7 @@ from contextlib import suppress
...
@@ -3,6 +3,7 @@ from contextlib import suppress
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
from
vllm.config
import
MultiModalConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
...
@@ -20,6 +21,7 @@ class MockModelConfig:
...
@@ -20,6 +21,7 @@ class MockModelConfig:
max_model_len
=
100
max_model_len
=
100
tokenizer_revision
=
None
tokenizer_revision
=
None
embedding_mode
=
False
embedding_mode
=
False
multimodal_config
=
MultiModalConfig
()
@
dataclass
@
dataclass
...
...
tests/entrypoints/openai/test_shutdown.py
View file @
0640f227
...
@@ -35,13 +35,14 @@ async def test_shutdown_on_engine_failure(tmp_path):
...
@@ -35,13 +35,14 @@ async def test_shutdown_on_engine_failure(tmp_path):
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
async
with
remote_server
.
get_async_client
()
as
client
:
with
pytest
.
raises
(
openai
.
APIConnectionError
):
with
pytest
.
raises
(
# This crashes the engine
(
openai
.
APIConnectionError
,
openai
.
InternalServerError
)):
await
client
.
completions
.
create
(
model
=
"bad-adapter"
,
# This crashes the engine
prompt
=
"Hello, my name is"
)
await
client
.
completions
.
create
(
model
=
"bad-adapter"
,
prompt
=
"Hello, my name is"
)
# Now the server should shut down
# Now the server should shut down
return_code
=
remote_server
.
proc
.
wait
(
timeout
=
1
)
return_code
=
remote_server
.
proc
.
wait
(
timeout
=
3
)
assert
return_code
is
not
None
assert
return_code
is
not
None
tests/entrypoints/openai/test_tokenization.py
View file @
0640f227
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
requests
import
requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str,
...
@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str,
model_name
==
"zephyr-lora2"
)
else
model_name
model_name
==
"zephyr-lora2"
)
else
model_name
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_vision.py
View file @
0640f227
...
@@ -2,14 +2,14 @@ from typing import Dict, List
...
@@ -2,14 +2,14 @@ from typing import Dict, List
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
...utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
MODEL_NAME
=
"microsoft/Phi-3.5-vision-instruct"
LLAVA_CHAT_TEMPLATE
=
VLLM_PATH
/
"examples/template_llava.jinja"
MAXIMUM_IMAGES
=
2
assert
LLAVA_CHAT_TEMPLATE
.
exists
()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
TEST_IMAGE_URLS
=
[
...
@@ -23,22 +23,19 @@ TEST_IMAGE_URLS = [
...
@@ -23,22 +23,19 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
args
=
[
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"4096"
,
"--max-num-seqs"
,
"bfloat16"
,
"5"
,
"--enforce-eager"
,
"--trust-remote-code"
,
"--limit-mm-per-prompt"
,
"--max-model-len"
,
f
"image=
{
MAXIMUM_IMAGES
}
"
"4096"
,
"--enforce-eager"
,
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest_asyncio
.
fixture
def
client
(
server
):
async
def
client
(
server
):
return
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
...
@@ -82,7 +79,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
...
@@ -82,7 +79,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
596
,
total_tokens
=
606
)
completion_tokens
=
10
,
prompt_tokens
=
772
,
total_tokens
=
782
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -137,7 +134,7 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -137,7 +134,7 @@ async def test_single_chat_session_image_base64encoded(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
596
,
total_tokens
=
606
)
completion_tokens
=
10
,
prompt_tokens
=
772
,
total_tokens
=
782
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -215,26 +212,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
...
@@ -215,26 +212,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[
TEST_IMAGE_URLS
[:
i
]
for
i
in
range
(
2
,
len
(
TEST_IMAGE_URLS
))])
async
def
test_multi_image_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
async
def
test_multi_image_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
image_url
s
:
List
[
str
]
):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
"user"
,
"user"
,
"content"
:
[
"content"
:
[
{
*
(
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
"image_url"
:
{
"image_url"
:
{
"url"
:
image_url
"url"
:
image_url
}
}
},
}
for
image_url
in
image_urls
),
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
},
{
{
"type"
:
"text"
,
"type"
:
"text"
,
"text"
:
"What's in this image?"
"text"
:
"What's in this image?"
...
@@ -242,20 +235,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
...
@@ -242,20 +235,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
],
],
}]
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
# test multi-image input
if
len
(
image_urls
)
>
MAXIMUM_IMAGES
:
await
client
.
chat
.
completions
.
create
(
with
pytest
.
raises
(
openai
.
BadRequestError
):
# test multi-image input
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
)
# the server should still work afterwards
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
completion
=
completion
.
choices
[
0
].
text
assert
completion
is
not
None
and
len
(
completion
)
>=
0
else
:
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
message
=
chat_completion
.
choices
[
0
].
message
# the server should still work afterwards
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
completion
=
completion
.
choices
[
0
].
text
assert
completion
is
not
None
and
len
(
completion
)
>=
0
tests/entrypoints/test_chat_utils.py
0 → 100644
View file @
0640f227
import
warnings
from
typing
import
Optional
import
pytest
from
PIL
import
Image
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.chat_utils
import
(
parse_chat_messages
,
parse_chat_messages_futures
)
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.utils
import
encode_image_base64
from
vllm.transformers_utils.tokenizer_group
import
TokenizerGroup
PHI3V_MODEL_ID
=
"microsoft/Phi-3.5-vision-instruct"
@pytest.fixture(scope="module")
def phi3v_model_config():
    """Build the module-scoped ``ModelConfig`` shared by the tests below.

    ``PHI3V_MODEL_ID`` is passed as the first two positional arguments
    (presumably model and tokenizer -- confirm against ``ModelConfig``), and
    the multimodal limit allows two images per prompt.
    """
    mm_limits = {"image": 2}
    config = ModelConfig(
        PHI3V_MODEL_ID,
        PHI3V_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="bfloat16",
        seed=0,
        limit_mm_per_prompt=mm_limits,
    )
    return config
@pytest.fixture(scope="module")
def phi3v_tokenizer():
    """Build the module-scoped ``TokenizerGroup`` for ``PHI3V_MODEL_ID``.

    LoRA is disabled, ``max_num_seqs`` is 5 and no maximum input length is
    set.
    """
    tokenizer_options = {
        "tokenizer_id": PHI3V_MODEL_ID,
        "enable_lora": False,
        "max_num_seqs": 5,
        "max_input_length": None,
    }
    return TokenizerGroup(**tokenizer_options)
@pytest.fixture(scope="module")
def image_url():
    """Return a ``data:image/jpeg;base64,...`` URL for the cherry blossom asset."""
    asset = ImageAsset('cherry_blossom')
    # Named ``encoded`` so the local does not share a name with the stdlib
    # ``base64`` module.
    encoded = encode_image_base64(asset.pil_image)
    return f"data:image/jpeg;base64,{encoded}"
def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
) -> None:
    """Assert that ``mm_data`` carries exactly ``image_count`` image(s).

    The dict must exist and have ``"image"`` as its only key.  For a single
    image the value must be a ``PIL.Image.Image``; otherwise it must be a
    list whose length equals ``image_count``.
    """
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    images = mm_data.get("image")
    assert images is not None

    if image_count != 1:
        assert isinstance(images, list) and len(images) == image_count
    else:
        assert isinstance(images, Image.Image)
def test_parse_chat_messages_single_image(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """One image part plus text becomes an ``<|image_1|>``-prefixed prompt."""
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in the image?"
            },
        ]
    }]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]

    conversation, mm_data = parse_chat_messages(messages, phi3v_model_config,
                                                phi3v_tokenizer)

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 1)
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant of the single-image test: the mm data is awaited."""
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in the image?"
            },
        ]
    }]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]

    conversation, mm_future = parse_chat_messages_futures(
        messages, phi3v_model_config, phi3v_tokenizer)

    assert conversation == expected_conversation
    # The multimodal payload is only available once the future resolves.
    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 1)
def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two image parts produce ``<|image_1|>`` and ``<|image_2|>`` prefixes."""
    # Distinct dict objects per image part, as in a real request payload.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]

    conversation, mm_data = parse_chat_messages(messages, phi3v_model_config,
                                                phi3v_tokenizer)

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant of the two-image test: the mm data is awaited."""
    # Distinct dict objects per image part, as in a real request payload.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]

    conversation, mm_future = parse_chat_messages_futures(
        messages, phi3v_model_config, phi3v_tokenizer)

    assert conversation == expected_conversation
    # The multimodal payload is only available once the future resolves.
    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Placeholders already written in the text are not added a second time."""
    prompt_text = (
        "What's in <|image_1|> and how does it compare to <|image_2|>?")
    # Distinct dict objects per image part, as in a real request payload.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    messages = [{
        "role": "user",
        "content": [*image_parts, {
            "type": "text",
            "text": prompt_text
        }]
    }]

    conversation, mm_data = parse_chat_messages(messages, phi3v_model_config,
                                                phi3v_tokenizer)

    # The text already references both images, so it is expected unchanged.
    assert conversation == [{
        "role": "user",
        "content":
        "What's in <|image_1|> and how does it compare to <|image_2|>?"
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Only the placeholder missing from the text is prepended."""
    # Distinct dict objects per image part, as in a real request payload.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    text_part = {
        "type":
        "text",
        "text":
        "What's in <|image_1|> and how does it compare to the other one?"
    }
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_data = parse_chat_messages(messages, phi3v_model_config,
                                                phi3v_tokenizer)

    # ``<|image_1|>`` is already in the text, so only ``<|image_2|>`` is
    # expected in front of it.
    expected_conversation = [{
        "role":
        "user",
        "content":
        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
        "other one?"
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Image numbering continues across user turns of one conversation."""
    first_user_turn = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ]
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_user_turn = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What about this one?"
            },
        ]
    }

    conversation, mm_data = parse_chat_messages(
        [first_user_turn, assistant_turn, second_user_turn],
        phi3v_model_config,
        phi3v_tokenizer,
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?"
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "<|image_2|>\nWhat about this one?"
        },
    ]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images in a single message must raise a ``ValueError``."""
    # Distinct dict objects per image part, as in a real request payload.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(3)]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    expected_error = (
        "At most 2 image\\(s\\) may be provided in one request\\.")

    with warnings.catch_warnings():
        # Ignore the un-awaited image-loading coroutine warning while the
        # call is expected to fail.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(ValueError, match=expected_error):
            parse_chat_messages(messages, phi3v_model_config, phi3v_tokenizer)
def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images spread over two user turns must raise a ``ValueError``."""
    first_user_turn = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ]
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    # Distinct dict objects per image part, as in a real request payload.
    second_turn_images = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    second_user_turn = {
        "role": "user",
        "content": [*second_turn_images, {
            "type": "text",
            "text": "What about these two?"
        }]
    }
    messages = [first_user_turn, assistant_turn, second_user_turn]

    expected_error = (
        "At most 2 image\\(s\\) may be provided in one request\\.")

    with warnings.catch_warnings():
        # Ignore the un-awaited image-loading coroutine warning while the
        # call is expected to fail.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(ValueError, match=expected_error):
            parse_chat_messages(messages, phi3v_model_config, phi3v_tokenizer)
tests/kernels/test_awq_triton.py
0 → 100644
View file @
0640f227
"""Tests for the AWQ Triton kernel.
Run `pytest tests/kernels/test_awq_triton.py`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization.awq_triton
import
(
AWQ_TRITON_SUPPORTED_GROUP_SIZES
,
awq_dequantize_triton
,
awq_gemm_triton
)
device
=
"cuda"
def reverse_awq_order(t: torch.Tensor):
    """Reorder the columns of ``t`` in groups of eight and keep the low nibble.

    Within every run of eight consecutive columns the values are picked in
    the order ``0, 4, 1, 5, 2, 6, 3, 7``; each element is then masked with
    ``0xF``.
    """
    values_per_word = 32 // 4  # eight 4-bit values per 32-bit word
    interleave = [0, 4, 1, 5, 2, 6, 3, 7]

    column_index = torch.arange(t.shape[-1],
                                dtype=torch.int32,
                                device=t.device)
    # Permute the column indices group by group, then flatten back to 1-D.
    column_index = column_index.view(-1, values_per_word)[:, interleave]
    column_index = column_index.view(-1)

    return t[:, column_index] & 0xF
# qweights - [R     , C // 8], int32
# scales   - [R // G, C     ], float16
# zeros    - [R // G, C // 8], int32
def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
                         qzeros: torch.Tensor,
                         group_size: int) -> torch.Tensor:
    """Reference AWQ dequantization in plain PyTorch.

    Unpacks the 4-bit values held in ``qweight`` and ``qzeros``, puts them
    into column order with ``reverse_awq_order`` and returns
    ``(weights - zeros) * scales``.  A ``group_size`` of ``-1`` means one
    group spanning every row of ``qweight``.
    """
    if group_size == -1:
        group_size = qweight.shape[0]

    bits = 4
    nibble_mask = (2**bits) - 1
    shifts = torch.arange(0, 32, bits, device=qzeros.device)

    def _unpack(packed: torch.Tensor) -> torch.Tensor:
        # Shift every packed word by each nibble offset, flatten the new
        # trailing axis into the columns, reorder, and keep the low 4 bits.
        nibbles = torch.bitwise_right_shift(packed[:, :, None],
                                            shifts[None, None, :]).to(
                                                torch.int8)
        nibbles = nibbles.view(packed.shape[0], -1)
        nibbles = reverse_awq_order(nibbles)
        return torch.bitwise_and(nibbles, nibble_mask)

    iweights = _unpack(qweight)
    zeros = _unpack(qzeros)

    # Expand the per-group scales and zero points to one row per weight row.
    scales = scales.repeat_interleave(group_size, dim=0)
    zeros = zeros.repeat_interleave(group_size, dim=0)
    return (iweights - zeros) * scales
# qweights - [R     , C // 8], int32
# scales   - [R // G, C     ], float16
# zeros    - [R // G, C // 8], int32
@pytest.mark.parametrize("qweight_rows", [3584, 18944, 128, 256, 512, 1024])
@pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128])
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
def test_dequantize(qweight_rows, qweight_cols, group_size):
    """Compare the Triton dequantize kernel with the PyTorch reference."""
    if group_size == -1:
        # -1 selects a single group covering every row.
        group_size = qweight_rows

    num_groups = qweight_rows // group_size
    int32_max = torch.iinfo(torch.int32).max

    torch.manual_seed(0)

    # Random draws stay in the order qweight, scales, zeros so the seeded
    # values are reproducible.
    qweight = torch.randint(0,
                            int32_max, (qweight_rows, qweight_cols),
                            dtype=torch.int32,
                            device=device)
    scales = torch.rand(num_groups,
                        qweight_cols * 8,
                        dtype=torch.float16,
                        device=device)
    zeros = torch.randint(0,
                          int32_max, (num_groups, qweight_cols),
                          dtype=torch.int32,
                          device=device)

    iweights_triton = awq_dequantize_triton(qweight, scales, zeros)

    # The kernel output must contain neither inf nor NaN.
    assert not (torch.any(torch.isinf(iweights_triton))
                or torch.any(torch.isnan(iweights_triton)))

    iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size)

    torch.testing.assert_close(iweights_triton, iweights_torch)
# input   - [N, K]
# qweight - [K, M // 8]
# qzeros  - [K // G, M // 8]
# scales  - [K // G, M]
@pytest.mark.parametrize("N", [1, 2, 4, 8, 14, 17, 23, 32])
@pytest.mark.parametrize("K", [128])
@pytest.mark.parametrize("M", [16, 24, 32])
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("splitK", [1, 8])
def test_gemm(N, K, M, splitK, group_size):
    """Compare the Triton AWQ GEMM with matmul on Triton-dequantized weights."""
    if group_size == -1:
        # -1 selects a single group covering the whole K dimension.
        group_size = K

    packed_cols = M // 8
    num_groups = K // group_size
    int32_max = torch.iinfo(torch.int32).max

    torch.manual_seed(0)

    # Random draws stay in the order input, qweight, qzeros, scales so the
    # seeded values are reproducible.
    activations = torch.rand((N, K), dtype=torch.float32, device=device)
    qweight = torch.randint(0, int32_max, (K, packed_cols), device=device)
    qzeros = torch.randint(0,
                           int32_max, (num_groups, packed_cols),
                           device=device)
    scales = torch.rand((num_groups, M), dtype=torch.float32, device=device)

    output_triton = awq_gemm_triton(activations, qweight, scales, qzeros,
                                    splitK)

    assert not (torch.any(torch.isinf(output_triton))
                or torch.any(torch.isnan(output_triton)))

    # Reference result: dequantize with the Triton kernel, then a dense matmul.
    dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros)
    output_torch = torch.matmul(activations, dequantized_weights)

    assert not (torch.any(torch.isinf(output_torch))
                or torch.any(torch.isnan(output_torch)))

    torch.testing.assert_close(output_triton.cpu(),
                               output_torch.cpu(),
                               atol=1e-1,
                               rtol=1e-1)
Prev
1
2
3
4
5
6
7
8
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment