Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4d3a2c28
Commit
4d3a2c28
authored
Dec 30, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.5' into v0.6.5-dev
parents
92ec5d8e
2d1b9baa
Changes
430
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
810 additions
and
83 deletions
+810
-83
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+136
-32
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-14
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+35
-0
tests/entrypoints/openai/test_root_path.py
tests/entrypoints/openai/test_root_path.py
+103
-0
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+94
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+10
-1
tests/entrypoints/openai/test_serving_engine.py
tests/entrypoints/openai/test_serving_engine.py
+12
-0
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+1
-1
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+51
-35
tests/entrypoints/openai/test_video.py
tests/entrypoints/openai/test_video.py
+353
-0
No files found.
Too many changes to show.
To preserve performance only
430 of 430+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/test_embedding.py
View file @
4d3a2c28
...
@@ -5,14 +5,18 @@ import openai
...
@@ -5,14 +5,18 @@ import openai
import
pytest
import
pytest
import
os
import
os
import
pytest_asyncio
import
pytest_asyncio
import
requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
EMBEDDING_MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_
server
():
def
server
():
args
=
[
args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
@@ -20,31 +24,29 @@ def embedding_server():
...
@@ -20,31 +24,29 @@ def embedding_server():
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"8192"
,
"8192"
,
"--chat-template"
,
DUMMY_CHAT_TEMPLATE
,
]
]
with
RemoteOpenAIServer
(
EMBEDDING_
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
@
pytest_asyncio
.
fixture
@
pytest_asyncio
.
fixture
async
def
embedding_client
(
embedding_
server
):
async
def
client
(
server
):
async
with
embedding_
server
.
get_async_client
()
as
async_client
:
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
"model_name"
,
async
def
test_single_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_single_embedding
(
embedding_client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
input_texts
=
[
"The chef prepared a delicious meal."
,
"The chef prepared a delicious meal."
,
]
]
# test single embedding
# test single embedding
embeddings
=
await
embedding_
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input_texts
,
input
=
input_texts
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
...
@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
# test using token IDs
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embeddings
=
await
embedding_
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input_tokens
,
input
=
input_tokens
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
...
@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
"model_name"
,
async
def
test_batch_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_batch_embedding
(
embedding_client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test List[str]
# test List[str]
input_texts
=
[
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
"Stars twinkle brightly in the night sky."
]
]
embeddings
=
await
embedding_
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input_texts
,
input
=
input_texts
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
...
@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
32
assert
embeddings
.
usage
.
total_tokens
==
32
# test List[List[int]]
# test List[List[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
[
25
,
32
,
64
,
77
]]
embeddings
=
await
embedding_
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input_tokens
,
input
=
input_tokens
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
...
@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
"model_name"
,
async
def
test_conversation_embedding
(
server
:
RemoteOpenAIServer
,
[
EMBEDDING_MODEL_NAME
],
client
:
openai
.
AsyncOpenAI
,
)
model_name
:
str
):
async
def
test_batch_base64_embedding
(
embedding_client
:
openai
.
AsyncOpenAI
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"The cat sat on the mat."
,
},
{
"role"
:
"assistant"
,
"content"
:
"A feline was resting on a rug."
,
},
{
"role"
:
"user"
,
"content"
:
"Stars twinkle brightly in the night sky."
,
}]
chat_response
=
requests
.
post
(
server
.
url_for
(
"v1/embeddings"
),
json
=
{
"model"
:
model_name
,
"messages"
:
messages
,
"encoding_format"
:
"float"
,
})
chat_response
.
raise_for_status
()
chat_embeddings
=
chat_response
.
json
()
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
tokenizer_mode
=
"fast"
)
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
chat_template
=
DUMMY_CHAT_TEMPLATE
,
add_generation_prompt
=
True
,
continue_final_message
=
False
,
tokenize
=
False
,
)
completion_response
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
prompt
,
encoding_format
=
"float"
,
# To be consistent with chat
extra_body
=
{
"add_special_tokens"
:
False
},
)
completion_embeddings
=
completion_response
.
model_dump
(
mode
=
"json"
)
assert
chat_embeddings
.
pop
(
"id"
)
is
not
None
assert
completion_embeddings
.
pop
(
"id"
)
is
not
None
assert
chat_embeddings
.
pop
(
"created"
)
<=
completion_embeddings
.
pop
(
"created"
)
assert
chat_embeddings
==
completion_embeddings
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_base64_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
input_texts
=
[
input_texts
=
[
"Hello my name is"
,
"Hello my name is"
,
"The best thing about vLLM is that it supports many different models"
"The best thing about vLLM is that it supports many different models"
]
]
responses_float
=
await
embedding_client
.
embeddings
.
create
(
responses_float
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
input
=
input_texts
,
model
=
model_name
,
encoding_format
=
"float"
)
model
=
model_name
,
encoding_format
=
"float"
)
responses_base64
=
await
embedding_client
.
embeddings
.
create
(
responses_base64
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
input
=
input_texts
,
model
=
model_name
,
encoding_format
=
"base64"
)
model
=
model_name
,
encoding_format
=
"base64"
)
decoded_responses_base64_data
=
[]
decoded_responses_base64_data
=
[]
for
data
in
responses_base64
.
data
:
for
data
in
responses_base64
.
data
:
...
@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
1
]
1
]
# Default response is float32 decoded from base64 by OpenAI Client
# Default response is float32 decoded from base64 by OpenAI Client
responses_default
=
await
embedding_
client
.
embeddings
.
create
(
responses_default
=
await
client
.
embeddings
.
create
(
input
=
input_texts
,
input
=
input_texts
,
model
=
model_name
)
model
=
model_name
)
assert
responses_float
.
data
[
0
].
embedding
==
responses_default
.
data
[
assert
responses_float
.
data
[
0
].
embedding
==
responses_default
.
data
[
0
].
embedding
0
].
embedding
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
1
].
embedding
1
].
embedding
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_single_embedding_truncation
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?"
,
]
# test single embedding
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
extra_body
=
{
"truncate_prompt_tokens"
:
10
})
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
input_tokens
=
[
1
,
24428
,
289
,
18341
,
26165
,
285
,
19323
,
283
,
289
,
26789
,
3871
,
28728
,
9901
,
340
,
2229
,
385
,
340
,
315
,
28741
,
28804
,
2
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
extra_body
=
{
"truncate_prompt_tokens"
:
10
})
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_single_embedding_truncation_invalid
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?"
,
]
with
pytest
.
raises
(
openai
.
BadRequestError
):
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
extra_body
=
{
"truncate_prompt_tokens"
:
8193
})
assert
"error"
in
embeddings
.
object
assert
"truncate_prompt_tokens value is greater than max_model_len. "
\
"Please, select a smaller truncation size."
in
embeddings
.
message
tests/entrypoints/openai/test_metrics.py
View file @
4d3a2c28
...
@@ -71,19 +71,21 @@ EXPECTED_VALUES = {
...
@@ -71,19 +71,21 @@ EXPECTED_VALUES = {
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_best_of"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_max_tokens"
:
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
"vllm:prompt_tokens"
:
[(
"_total"
,
"vllm:prompt_tokens"
:
[(
"_total"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
"vllm:generation_tokens"
:
"vllm:generation_tokens"
:
[
[(
"_total"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
(
"_total"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)
],
"vllm:request_success"
:
[(
"_total"
,
_NUM_REQUESTS
)],
"vllm:request_success"
:
[(
"_total"
,
_NUM_REQUESTS
)],
}
}
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_metrics_counts
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_metrics_counts
(
server
:
RemoteOpenAIServer
,
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
client
:
openai
.
AsyncClient
):
for
_
in
range
(
_NUM_REQUESTS
):
for
_
in
range
(
_NUM_REQUESTS
):
# sending a request triggers the metrics to be logged.
# sending a request triggers the metrics to be logged.
await
client
.
completions
.
create
(
await
client
.
completions
.
create
(
...
@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
...
@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
prompt
=
_TOKENIZED_PROMPT
,
prompt
=
_TOKENIZED_PROMPT
,
max_tokens
=
_NUM_GENERATION_TOKENS_PER_REQUEST
)
max_tokens
=
_NUM_GENERATION_TOKENS_PER_REQUEST
)
response
=
requests
.
get
(
base_url
+
"/
metrics"
)
response
=
requests
.
get
(
server
.
url_for
(
"
metrics"
)
)
print
(
response
.
text
)
print
(
response
.
text
)
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
status_code
==
HTTPStatus
.
OK
...
@@ -152,9 +154,9 @@ EXPECTED_METRICS = [
...
@@ -152,9 +154,9 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum"
,
"vllm:request_params_n_sum"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_count"
,
"vllm:request_params_n_count"
,
"vllm:request_params_
best_of
_sum"
,
"vllm:request_params_
max_tokens
_sum"
,
"vllm:request_params_
best_of
_bucket"
,
"vllm:request_params_
max_tokens
_bucket"
,
"vllm:request_params_
best_of
_count"
,
"vllm:request_params_
max_tokens
_count"
,
"vllm:num_preemptions_total"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
...
@@ -175,16 +177,15 @@ EXPECTED_METRICS = [
...
@@ -175,16 +177,15 @@ EXPECTED_METRICS = [
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_metrics_exist
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_metrics_exist
(
server
:
RemoteOpenAIServer
,
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
client
:
openai
.
AsyncClient
):
# sending a request triggers the metrics to be logged.
# sending a request triggers the metrics to be logged.
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
)
temperature
=
0.0
)
response
=
requests
.
get
(
base_url
+
"/
metrics"
)
response
=
requests
.
get
(
server
.
url_for
(
"
metrics"
)
)
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
EXPECTED_METRICS
:
for
metric
in
EXPECTED_METRICS
:
...
...
tests/entrypoints/openai/test_prompt_validation.py
View file @
4d3a2c28
...
@@ -21,3 +21,38 @@ async def test_empty_prompt():
...
@@ -21,3 +21,38 @@ async def test_empty_prompt():
prompt
=
""
,
prompt
=
""
,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
)
temperature
=
0.0
)
@
pytest
.
mark
.
asyncio
async
def
test_out_of_vocab_token_ids
():
model_name
=
"gpt2"
server_args
=
[
"--enforce-eager"
]
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
re
.
compile
(
'.*out of vocabulary.*'
)):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
999999
],
max_tokens
=
5
,
temperature
=
0.0
)
@
pytest
.
mark
.
asyncio
async
def
test_reject_multistep_with_guided_decoding
():
model_name
=
"gpt2"
server_args
=
[
"--enforce-eager"
,
"--num-scheduler-steps"
,
"8"
]
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
re
.
compile
(
'.*Guided decoding .* multi-step decoding.*'
)):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello"
,
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
{
"response_format"
:
{
"type"
:
"json_object"
}})
tests/entrypoints/openai/test_root_path.py
0 → 100644
View file @
4d3a2c28
import
contextlib
import
os
from
typing
import
Any
,
List
,
NamedTuple
import
openai
# use the official client for correctness check
import
pytest
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# # any model with a chat template should work here
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
API_KEY
=
"abc-123"
ERROR_API_KEY
=
"abc"
ROOT_PATH
=
"llm"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--enforce-eager"
,
"--max-model-len"
,
"4080"
,
"--root-path"
,
# use --root-path=/llm for testing
"/"
+
ROOT_PATH
,
"--chat-template"
,
DUMMY_CHAT_TEMPLATE
,
]
envs
=
os
.
environ
.
copy
()
envs
[
"VLLM_API_KEY"
]
=
API_KEY
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
env_dict
=
envs
)
as
remote_server
:
yield
remote_server
class
TestCase
(
NamedTuple
):
model_name
:
str
base_url
:
List
[
str
]
api_key
:
str
expected_error
:
Any
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"test_case"
,
[
TestCase
(
model_name
=
MODEL_NAME
,
base_url
=
[
"v1"
],
# http://localhost:8000/v1
api_key
=
ERROR_API_KEY
,
expected_error
=
openai
.
AuthenticationError
),
TestCase
(
model_name
=
MODEL_NAME
,
base_url
=
[
ROOT_PATH
,
"v1"
],
# http://localhost:8000/llm/v1
api_key
=
ERROR_API_KEY
,
expected_error
=
openai
.
AuthenticationError
),
TestCase
(
model_name
=
MODEL_NAME
,
base_url
=
[
"v1"
],
# http://localhost:8000/v1
api_key
=
API_KEY
,
expected_error
=
None
),
TestCase
(
model_name
=
MODEL_NAME
,
base_url
=
[
ROOT_PATH
,
"v1"
],
# http://localhost:8000/llm/v1
api_key
=
API_KEY
,
expected_error
=
None
),
],
)
async
def
test_chat_session_root_path_with_api_key
(
server
:
RemoteOpenAIServer
,
test_case
:
TestCase
):
saying
:
str
=
"Here is a common saying about apple. An apple a day, keeps"
ctx
=
contextlib
.
nullcontext
()
if
test_case
.
expected_error
is
not
None
:
ctx
=
pytest
.
raises
(
test_case
.
expected_error
)
with
ctx
:
client
=
openai
.
AsyncOpenAI
(
api_key
=
test_case
.
api_key
,
base_url
=
server
.
url_for
(
*
test_case
.
base_url
),
max_retries
=
0
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
test_case
.
model_name
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"tell me a common saying"
},
{
"role"
:
"assistant"
,
"content"
:
saying
}],
extra_body
=
{
"continue_final_message"
:
True
,
"add_generation_prompt"
:
False
})
assert
chat_completion
.
id
is
not
None
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"stop"
message
=
choice
.
message
assert
len
(
message
.
content
)
>
0
assert
message
.
role
==
"assistant"
tests/entrypoints/openai/test_score.py
0 → 100644
View file @
4d3a2c28
import
os
import
pytest
import
requests
from
vllm.entrypoints.openai.protocol
import
ScoreResponse
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--enforce-eager"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_text_1_str_text_2_list
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
text_1
=
"What is the capital of France?"
text_2
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
]
score_response
=
requests
.
post
(
server
.
url_for
(
"score"
),
json
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
,
})
score_response
.
raise_for_status
()
score
=
ScoreResponse
.
model_validate
(
score_response
.
json
())
assert
score
.
id
is
not
None
assert
score
.
data
is
not
None
assert
len
(
score
.
data
)
==
2
assert
score
.
data
[
0
].
score
<=
0.01
assert
score
.
data
[
1
].
score
>=
0.9
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_text_1_list_text_2_list
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
text_1
=
[
"What is the capital of the United States?"
,
"What is the capital of France?"
]
text_2
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
]
score_response
=
requests
.
post
(
server
.
url_for
(
"score"
),
json
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
,
})
score_response
.
raise_for_status
()
score
=
ScoreResponse
.
model_validate
(
score_response
.
json
())
assert
score
.
id
is
not
None
assert
score
.
data
is
not
None
assert
len
(
score
.
data
)
==
2
assert
score
.
data
[
0
].
score
<=
0.01
assert
score
.
data
[
1
].
score
>=
0.9
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_text_1_str_text_2_str
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
text_1
=
"What is the capital of France?"
text_2
=
"The capital of France is Paris."
score_response
=
requests
.
post
(
server
.
url_for
(
"score"
),
json
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
,
})
score_response
.
raise_for_status
()
score
=
ScoreResponse
.
model_validate
(
score_response
.
json
())
assert
score
.
id
is
not
None
assert
score
.
data
is
not
None
assert
len
(
score
.
data
)
==
1
assert
score
.
data
[
0
].
score
>=
0.9
tests/entrypoints/openai/test_serving_chat.py
View file @
4d3a2c28
...
@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
...
@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
@
dataclass
class
MockHFConfig
:
model_type
:
str
=
"any"
@
dataclass
@
dataclass
class
MockModelConfig
:
class
MockModelConfig
:
task
=
"generate"
tokenizer
=
MODEL_NAME
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
trust_remote_code
=
False
tokenizer_mode
=
"auto"
tokenizer_mode
=
"auto"
max_model_len
=
100
max_model_len
=
100
tokenizer_revision
=
None
tokenizer_revision
=
None
embedding_mode
=
False
multimodal_config
=
MultiModalConfig
()
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
logits_processor_pattern
=
None
@
dataclass
@
dataclass
...
@@ -43,6 +50,7 @@ async def _async_serving_chat_init():
...
@@ -43,6 +50,7 @@ async def _async_serving_chat_init():
BASE_MODEL_PATHS
,
BASE_MODEL_PATHS
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
lora_modules
=
None
,
lora_modules
=
None
,
prompt_adapters
=
None
,
prompt_adapters
=
None
,
request_logger
=
None
)
request_logger
=
None
)
...
@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens():
...
@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens():
BASE_MODEL_PATHS
,
BASE_MODEL_PATHS
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
lora_modules
=
None
,
lora_modules
=
None
,
prompt_adapters
=
None
,
prompt_adapters
=
None
,
request_logger
=
None
)
request_logger
=
None
)
...
...
tests/entrypoints/openai/test_serving_engine.py
View file @
4d3a2c28
...
@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
...
@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest
,
LoadLoraAdapterRequest
,
UnloadLoraAdapterRequest
)
UnloadLoraAdapterRequest
)
from
vllm.entrypoints.openai.serving_engine
import
BaseModelPath
,
OpenAIServing
from
vllm.entrypoints.openai.serving_engine
import
BaseModelPath
,
OpenAIServing
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b"
)
...
@@ -35,6 +37,16 @@ async def _async_serving_engine_init():
...
@@ -35,6 +37,16 @@ async def _async_serving_engine_init():
return
serving_engine
return
serving_engine
@
pytest
.
mark
.
asyncio
async
def
test_serving_model_name
():
serving_engine
=
await
_async_serving_engine_init
()
assert
serving_engine
.
_get_model_name
(
None
)
==
MODEL_NAME
request
=
LoRARequest
(
lora_name
=
"adapter"
,
lora_path
=
"/path/to/adapter2"
,
lora_int_id
=
1
)
assert
serving_engine
.
_get_model_name
(
request
)
==
request
.
lora_name
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_load_lora_adapter_success
():
async
def
test_load_lora_adapter_success
():
serving_engine
=
await
_async_serving_engine_init
()
serving_engine
=
await
_async_serving_engine_init
()
...
...
tests/entrypoints/openai/test_shutdown.py
View file @
4d3a2c28
...
@@ -6,7 +6,7 @@ import pytest
...
@@ -6,7 +6,7 @@ import pytest
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
HuggingFaceH4/zephyr-7b-beta
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"
meta-llama/Llama-3.2-1B
"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_tokenization.py
View file @
4d3a2c28
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
os
import
os
import
pytest_asyncio
import
pytest_asyncio
...
@@ -57,9 +56,11 @@ async def client(server):
...
@@ -57,9 +56,11 @@ async def client(server):
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
indirect
=
[
"tokenizer_name"
],
indirect
=
[
"tokenizer_name"
],
)
)
async
def
test_tokenize_completions
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_tokenize_completions
(
model_name
:
str
,
tokenizer_name
:
str
):
server
:
RemoteOpenAIServer
,
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer_mode
=
"fast"
)
...
@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
...
@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
prompt
=
"vllm1 This is a test prompt."
prompt
=
"vllm1 This is a test prompt."
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
response
=
requests
.
post
(
base_url
+
"/
tokenize"
,
response
=
requests
.
post
(
server
.
url_for
(
"
tokenize"
)
,
json
=
{
json
=
{
"add_special_tokens"
:
add_special
,
"add_special_tokens"
:
add_special
,
"model"
:
model_name
,
"model"
:
model_name
,
...
@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
...
@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
indirect
=
[
"tokenizer_name"
],
indirect
=
[
"tokenizer_name"
],
)
)
async
def
test_tokenize_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
async
def
test_tokenize_chat
(
tokenizer_name
:
str
):
server
:
RemoteOpenAIServer
,
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer_mode
=
"fast"
)
...
@@ -106,28 +109,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
...
@@ -106,28 +109,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
"Can I ask a question? vllm1"
"content"
:
"Can I ask a question? vllm1"
}]
}]
for
continue_final
in
[
False
,
True
]:
prompt
=
tokenizer
.
apply_chat_template
(
if
add_generation
and
continue_final
:
add_generation_prompt
=
add_generation
,
continue
conversation
=
conversation
,
if
continue_final
:
tokenize
=
False
)
conversation
.
append
({
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
"role"
:
"assistant"
,
"content"
:
"Sure,"
response
=
requests
.
post
(
base_url
+
"/tokenize"
,
})
json
=
{
"add_generation_prompt"
:
prompt
=
tokenizer
.
apply_chat_template
(
add_generation
,
add_generation_prompt
=
add_generation
,
"add_special_tokens"
:
add_special
,
continue_final_message
=
continue_final
,
"messages"
:
conversation
,
conversation
=
conversation
,
"model"
:
model_name
tokenize
=
False
)
})
tokens
=
tokenizer
.
encode
(
prompt
,
response
.
raise_for_status
()
add_special_tokens
=
add_special
)
assert
response
.
json
()
==
{
response
=
requests
.
post
(
server
.
url_for
(
"tokenize"
),
"tokens"
:
tokens
,
json
=
{
"count"
:
len
(
tokens
),
"add_generation_prompt"
:
"max_model_len"
:
8192
add_generation
,
}
"continue_final_message"
:
continue_final
,
"add_special_tokens"
:
add_special
,
"messages"
:
conversation
,
"model"
:
model_name
})
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"tokens"
:
tokens
,
"count"
:
len
(
tokens
),
"max_model_len"
:
8192
}
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
...
@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
indirect
=
[
"tokenizer_name"
],
indirect
=
[
"tokenizer_name"
],
)
)
async
def
test_detokenize
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
async
def
test_detokenize
(
tokenizer_name
:
str
):
server
:
RemoteOpenAIServer
,
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer_mode
=
"fast"
)
prompt
=
"This is a test prompt. vllm1"
prompt
=
"This is a test prompt. vllm1"
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)
print
(
f
"CALLING
{
base_url
}
FOR
{
model_name
}
"
)
response
=
requests
.
post
(
server
.
url_for
(
"detokenize"
),
response
=
requests
.
post
(
base_url
+
"/detokenize"
,
json
=
{
json
=
{
"model"
:
model_name
,
"model"
:
model_name
,
"tokens"
:
tokens
"tokens"
:
tokens
...
...
tests/entrypoints/openai/test_video.py
0 → 100644
View file @
4d3a2c28
from
typing
import
Dict
,
List
import
os
import
openai
import
pytest
import
pytest_asyncio
from
vllm.multimodal.utils
import
encode_video_base64
,
fetch_video
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
,
urls_port
# Checkpoint under test, resolved below the local models directory.
MODEL_NAME = os.path.join(models_path_prefix,
                          "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

# Per-prompt video limit; the server fixture passes it via
# --limit-mm-per-prompt.
MAXIMUM_VIDEOS = 4

# The clips are requested from an HTTP server on localhost:<urls_port>.
# The public locations of the same file names, kept for reference:
#   http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4
#   http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4
#   http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4
#   http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4
TEST_VIDEO_URLS = [
    f"http://localhost:{urls_port}/{file_name}" for file_name in (
        "BigBuckBunny.mp4",
        "ElephantsDream.mp4",
        "ForBiggerBlazes.mp4",
        "ForBiggerFun.mp4",
    )
]
@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for the module and yield it.

    The server is launched for MODEL_NAME with the generate task, bfloat16,
    a 32768-token context, eager mode, and a per-prompt video limit of
    MAXIMUM_VIDEOS. It is torn down when the context manager exits.
    """
    cli_args = []
    cli_args += ["--task", "generate"]
    cli_args += ["--dtype", "bfloat16"]
    cli_args += ["--max-model-len", "32768"]
    cli_args += ["--max-num-seqs", "2"]
    cli_args += ["--enforce-eager"]
    cli_args += ["--trust-remote-code"]
    cli_args += ["--limit-mm-per-prompt", f"video={MAXIMUM_VIDEOS}"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the running ``server`` fixture."""
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]:
    """Map every URL in TEST_VIDEO_URLS to its encoded video payload.

    Each value is ``encode_video_base64(fetch_video(url))``; the fixture is
    session-scoped, so the fetch/encode work is done once per test session.
    """
    encoded_by_url: Dict[str, str] = {}
    for url in TEST_VIDEO_URLS:
        encoded_by_url[url] = encode_video_base64(fetch_video(url))
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(client: openai.AsyncOpenAI,
                                         model_name: str, video_url: str):
    """Hold a two-turn chat about a single video referenced by URL.

    The first turn must stop at the 10-token cap (finish reason "length")
    and report the pinned usage numbers; the second turn only has to return
    non-None content.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, question_part]}]

    # First turn: single completion with logprobs requested.
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(first_reply.choices) == 1

    choice = first_reply.choices[0]
    assert choice.finish_reason == "length"
    assert first_reply.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    answer = choice.message
    assert answer.content is not None and len(answer.content) >= 10
    assert answer.role == "assistant"

    # Second turn: feed the answer back and ask a follow-up question.
    messages.append({"role": "assistant", "content": answer.content})
    messages.append({
        "role": "user",
        "content": "express your result in json"
    })
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    follow_up = second_reply.choices[0].message
    # The length comparison is always true; effectively this only checks
    # that content is present.
    assert follow_up.content is not None and len(follow_up.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
                                                    model_name: str,
                                                    video_url: str):
    """Request two beam-search candidates for one video URL.

    Expects exactly two choices whose message contents differ.
    """
    user_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                }
            },
            {
                "type": "text",
                "text": "What's in this video?"
            },
        ],
    }
    messages = [user_turn]

    beam_options = {"use_beam_search": True}
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=beam_options)

    candidates = chat_completion.choices
    assert len(candidates) == 2
    assert candidates[0].message.content != candidates[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Hold a two-turn chat about one video sent inline as a data URL.

    Same checks as the URL-based variant: the first turn must end with
    finish reason "length" and the pinned usage numbers; the second turn
    only has to return non-None content.
    """
    data_url = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    video_part = {"type": "video_url", "video_url": {"url": data_url}}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, question_part]}]

    # First turn: single completion with logprobs requested.
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(first_reply.choices) == 1

    choice = first_reply.choices[0]
    assert choice.finish_reason == "length"
    assert first_reply.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    answer = choice.message
    assert answer.content is not None and len(answer.content) >= 10
    assert answer.role == "assistant"

    # Second turn: feed the answer back and ask a follow-up question.
    messages.append({"role": "assistant", "content": answer.content})
    messages.append({
        "role": "user",
        "content": "express your result in json"
    })
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    follow_up = second_reply.choices[0].message
    # The length comparison is always true; effectively this only checks
    # that content is present.
    assert follow_up.content is not None and len(follow_up.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Request two beam-search candidates for one inline (data URL) video.

    Expects exactly two choices whose message contents differ.
    """
    data_url = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    user_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "video_url",
                "video_url": {
                    "url": data_url
                }
            },
            {
                "type": "text",
                "text": "What's in this video?"
            },
        ],
    }
    messages = [user_turn]

    beam_options = {"use_beam_search": True}
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        extra_body=beam_options)

    candidates = chat_completion.choices
    assert len(candidates) == 2
    assert candidates[0].message.content != candidates[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(client: openai.AsyncOpenAI,
                                    model_name: str, video_url: str):
    """Compare a streamed chat completion with its non-streamed twin.

    Both requests use temperature 0.0 and a 10-token cap. The concatenated
    streamed deltas must equal the non-streamed content, and exactly one
    streamed chunk may carry a finish reason.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, question_part]}]

    # Reference run without streaming.
    reference = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    reference_choice = reference.choices[0]
    output = reference_choice.message.content
    stop_reason = reference_choice.finish_reason

    # Same request, streamed.
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    pieces: List[str] = []
    finish_events = 0
    async for event in stream:
        streamed_choice = event.choices[0]
        delta = streamed_choice.delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            pieces.append(delta.content)
        if streamed_choice.finish_reason is not None:
            finish_events += 1

    # A finish reason may appear only once, on the final chunk; the checks
    # below look at the last chunk seen by the loop.
    assert finish_events == 1
    assert streamed_choice.finish_reason == stop_reason
    assert delta.content
    assert "".join(pieces) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls",
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
                                 video_urls: List[str]):
    """Send several videos in one user message.

    Within the MAXIMUM_VIDEOS limit the request must succeed and return
    non-None content. Above the limit the request must raise
    ``openai.BadRequestError`` and a subsequent plain completion must still
    be served.

    NOTE(review): with four URLs in TEST_VIDEO_URLS the parametrization
    yields lists of 2 and 3 videos, and MAXIMUM_VIDEOS is 4, so the
    over-limit path is never selected as written -- confirm whether that is
    intended.
    """
    video_parts = [{
        "type": "video_url",
        "video_url": {
            "url": video_url
        }
    } for video_url in video_urls]
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": video_parts + [question_part]}]

    if len(video_urls) <= MAXIMUM_VIDEOS:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        reply = chat_completion.choices[0].message
        assert reply.content is not None and len(reply.content) >= 0
        return

    # Too many videos: the multi-video request has to be rejected.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    text = completion.choices[0].text
    assert text is not None and len(text) >= 0
Prev
1
…
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment