Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e7c1b7f3
Commit
e7c1b7f3
authored
Sep 06, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.5.4-dtk24.04.1'
parents
7462218e
04c62b93
Changes
442
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2129 additions
and
725 deletions
+2129
-725
tests/engine/output_processor/test_multi_step.py
tests/engine/output_processor/test_multi_step.py
+4
-4
tests/engine/output_processor/test_stop_checker.py
tests/engine/output_processor/test_stop_checker.py
+2
-2
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+91
-0
tests/entrypoints/conftest.py
tests/entrypoints/conftest.py
+89
-0
tests/entrypoints/llm/__init__.py
tests/entrypoints/llm/__init__.py
+0
-0
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+1
-3
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+1
-3
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+2
-4
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+142
-0
tests/entrypoints/openai/__init__.py
tests/entrypoints/openai/__init__.py
+0
-0
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+61
-0
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+109
-634
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+710
-0
tests/entrypoints/openai/test_disable_mp.py
tests/entrypoints/openai/test_disable_mp.py
+715
-0
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+40
-17
tests/entrypoints/openai/test_guided_processors.py
tests/entrypoints/openai/test_guided_processors.py
+13
-54
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+60
-0
tests/entrypoints/openai/test_oot_registration.py
tests/entrypoints/openai/test_oot_registration.py
+4
-3
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+83
-0
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_run_batch.py
+2
-1
No files found.
Too many changes to show.
To preserve performance only
442 of 442+
files are displayed.
Plain diff
Email patch
tests/engine/output_processor/test_multi_step.py
View file @
e7c1b7f3
...
@@ -32,7 +32,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
...
@@ -32,7 +32,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
output_processor
=
MultiStepOutputProcessor
(
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
stop_checker
=
stop_checker
,
stop_checker
=
stop_checker
,
...
@@ -86,7 +86,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
...
@@ -86,7 +86,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
stop_checker
=
stop_checker
,
stop_checker
=
stop_checker
,
...
@@ -148,7 +148,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
...
@@ -148,7 +148,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
stop_checker
=
stop_checker
,
stop_checker
=
stop_checker
,
...
@@ -215,7 +215,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
...
@@ -215,7 +215,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
stop_checker
=
stop_checker
,
stop_checker
=
stop_checker
,
...
...
tests/engine/output_processor/test_stop_checker.py
View file @
e7c1b7f3
...
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
...
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
@
pytest
.
mark
.
parametrize
([
"text_wo_eos"
,
"eos_token"
,
"eos_token_id"
],
[
@
pytest
.
mark
.
parametrize
([
"text_wo_eos"
,
"eos_token"
,
"eos_token_id"
],
[
(
"This text ends with EOS token"
,
"</s>"
,
2
),
(
"This text ends with EOS token"
,
"</s>"
,
2
),
])
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
])
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_on_eos_token
(
text_wo_eos
:
str
,
eos_token
:
str
,
eos_token_id
:
int
,
def
test_stop_on_eos_token
(
text_wo_eos
:
str
,
eos_token
:
str
,
eos_token_id
:
int
,
ignore_eos
:
bool
,
include_stop_str_in_output
:
bool
):
ignore_eos
:
bool
,
include_stop_str_in_output
:
bool
):
...
...
tests/engine/test_custom_executor.py
0 → 100644
View file @
e7c1b7f3
import
asyncio
import
os
import
pytest
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.executor.gpu_executor
import
GPUExecutor
,
GPUExecutorAsync
from
vllm.sampling_params
import
SamplingParams
class
Mock
:
...
class
CustomGPUExecutor
(
GPUExecutor
):
def
execute_model
(
self
,
*
args
,
**
kwargs
):
# Drop marker to show that this was ran
with
open
(
".marker"
,
"w"
):
...
return
super
().
execute_model
(
*
args
,
**
kwargs
)
class
CustomGPUExecutorAsync
(
GPUExecutorAsync
):
async
def
execute_model_async
(
self
,
*
args
,
**
kwargs
):
with
open
(
".marker"
,
"w"
):
...
return
await
super
().
execute_model_async
(
*
args
,
**
kwargs
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
Mock
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
TypeError
):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutor
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor
(
model
,
tmpdir
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmpdir
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutor
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
engine
.
step
()
assert
os
.
path
.
exists
(
".marker"
)
finally
:
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_async
(
model
,
tmpdir
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmpdir
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutorAsync
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
async
def
t
():
stream
=
await
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
async
for
x
in
stream
:
...
asyncio
.
run
(
t
())
assert
os
.
path
.
exists
(
".marker"
)
finally
:
os
.
chdir
(
cwd
)
tests/entrypoints/conftest.py
0 → 100644
View file @
e7c1b7f3
import
pytest
@
pytest
.
fixture
def
sample_prompts
():
return
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
@
pytest
.
fixture
def
sample_token_ids
():
return
[
[
0
],
[
0
,
1
],
[
0
,
2
,
1
],
[
0
,
3
,
1
,
2
],
]
@
pytest
.
fixture
def
sample_regex
():
return
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
@
pytest
.
fixture
def
sample_json_schema
():
return
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work_history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"number"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work_history"
]
}
@
pytest
.
fixture
def
sample_guided_choice
():
return
[
"Python"
,
"Java"
,
"JavaScript"
,
"C++"
,
"C#"
,
"PHP"
,
"TypeScript"
,
"Ruby"
,
"Swift"
,
"Kotlin"
]
@
pytest
.
fixture
def
sample_sql_statements
():
return
(
"""
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
)
tests/entrypoints/llm/__init__.py
0 → 100644
View file @
e7c1b7f3
tests/entrypoints/test
_llm
_encode.py
→
tests/entrypoints/
llm/
test_encode.py
View file @
e7c1b7f3
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
vllm
import
LLM
,
EmbeddingRequestOutput
,
PoolingParams
from
vllm
import
LLM
,
EmbeddingRequestOutput
,
PoolingParams
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
...
@@ -25,8 +25,6 @@ TOKEN_IDS = [
...
@@ -25,8 +25,6 @@ TOKEN_IDS = [
[
1000
,
1003
,
1001
,
1002
],
[
1000
,
1003
,
1001
,
1002
],
]
]
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
def
llm
():
...
...
tests/entrypoints/test
_llm
_generate.py
→
tests/entrypoints/
llm/
test_generate.py
View file @
e7c1b7f3
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"facebook/opt-125m"
MODEL_NAME
=
"facebook/opt-125m"
...
@@ -23,8 +23,6 @@ TOKEN_IDS = [
...
@@ -23,8 +23,6 @@ TOKEN_IDS = [
[
0
,
3
,
1
,
2
],
[
0
,
3
,
1
,
2
],
]
]
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
def
llm
():
...
...
tests/entrypoints/test
_llm
_generate_multiple_loras.py
→
tests/entrypoints/
llm/
test_generate_multiple_loras.py
View file @
e7c1b7f3
...
@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
...
@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
@@ -20,8 +20,6 @@ PROMPTS = [
...
@@ -20,8 +20,6 @@ PROMPTS = [
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
def
llm
():
...
@@ -44,7 +42,7 @@ def llm():
...
@@ -44,7 +42,7 @@ def llm():
cleanup
()
cleanup
()
@
pytest
.
fixture
(
scope
=
"
session
"
)
@
pytest
.
fixture
(
scope
=
"
module
"
)
def
zephyr_lora_files
():
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
return
snapshot_download
(
repo_id
=
LORA_NAME
)
...
...
tests/entrypoints/llm/test_guided_generate.py
0 → 100644
View file @
e7c1b7f3
import
json
import
re
import
weakref
import
jsonschema
import
pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
...conftest
import
cleanup
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
cleanup
()
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_regex
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
generated_text
)
assert
generated_text
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
generated_text
)
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_json_completion
(
sample_json_schema
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_json
=
sample_json_schema
))
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
outputs
=
llm
.
generate
(
prompts
=
"The best language for type-safe systems programming is "
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_choice
=
sample_guided_choice
))
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
generated_text
)
assert
generated_text
is
not
None
assert
generated_text
in
sample_guided_choice
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_grammar
(
sample_sql_statements
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
1000
,
)
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_grammar
=
sample_sql_statements
))
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
# use Lark to parse the output, and make sure it's a valid parse tree
from
lark
import
Lark
parser
=
Lark
(
sample_sql_statements
)
parser
.
parse
(
generated_text
)
# remove spaces for comparison b/c we removed them in the grammar
ground_truth
=
"SELECT col_1 from table_1 where col_1 = 1"
.
replace
(
" "
,
""
)
assert
generated_text
.
strip
()
==
ground_truth
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
tests/entrypoints/openai/__init__.py
0 → 100644
View file @
e7c1b7f3
tests/entrypoints/openai/test_basic.py
0 → 100644
View file @
e7c1b7f3
from
http
import
HTTPStatus
import
openai
import
pytest
import
requests
from
vllm.version
import
__version__
as
VLLM_VERSION
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/version"
)
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/health"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
asyncio
async
def
test_log_metrics
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/metrics"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
tests/entrypoints/
test_
openai
_server
.py
→
tests/entrypoints/openai
/test_chat
.py
View file @
e7c1b7f3
# imports for guided decoding tests
# imports for guided decoding tests
import
json
import
json
import
re
import
re
from
typing
import
List
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import
ray
import
torch
import
torch
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokeniz
er
from
...utils
import
RemoteOpenAIServ
er
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.
.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
.
test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
@@ -23,73 +19,10 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...
@@ -23,73 +19,10 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# generation quality here
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
TEST_SCHEMA
=
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"string"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work history"
]
}
TEST_REGEX
=
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
TEST_CHOICE
=
[
"Python"
,
"Java"
,
"JavaScript"
,
"C++"
,
"C#"
,
"PHP"
,
"TypeScript"
,
"Ruby"
,
"Swift"
,
"Kotlin"
]
pytestmark
=
pytest
.
mark
.
openai
@
pytest
.
fixture
(
scope
=
"session"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
ray_ctx
():
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
})
yield
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
,
ray_ctx
):
def
server
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
# noqa: F811
return
RemoteOpenAIServer
([
args
=
[
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
...
@@ -100,14 +33,17 @@ def server(zephyr_lora_files, ray_ctx):
...
@@ -100,14 +33,17 @@ def server(zephyr_lora_files, ray_ctx):
"--enable-lora"
,
"--enable-lora"
,
"--lora-modules"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_
added_tokens_
files
}
"
,
"--max-lora-rank"
,
"--max-lora-rank"
,
"64"
,
"64"
,
"--max-cpu-loras"
,
"--max-cpu-loras"
,
"2"
,
"2"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
"128"
,
"128"
,
])
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -115,155 +51,6 @@ def client(server):
...
@@ -115,155 +51,6 @@ def client(server):
return
server
.
get_async_client
()
return
server
.
get_async_client
()
async
def
test_check_models
(
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
lora_models
=
models
[
1
:]
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
assert
lora_models
[
0
].
id
==
"zephyr-lora"
assert
lora_models
[
1
].
id
==
"zephyr-lora2"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
None
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
0
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
==
1
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
5
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
5
<=
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
21
,
)
...
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
stream
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
30
,
stream
=
True
,
)
async
for
chunk
in
stream
:
...
# the server should still work afterwards
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
# first test base model, then test loras
...
@@ -432,40 +219,6 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
...
@@ -432,40 +219,6 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is an LLM?"
single_completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
)
single_output
=
single_completion
.
choices
[
0
].
text
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
)
chunks
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
""
.
join
(
chunks
)
==
single_output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
# just test 1 lora hereafter
...
@@ -499,7 +252,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
...
@@ -499,7 +252,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
chunks
=
[]
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
...
@@ -542,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -542,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
# Test stream=True, stream_options={"include_usage": True,
stream
=
await
client
.
chat
.
completions
.
create
(
# "continuous_usage_stats": False}}
model
=
model_name
,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
False
})
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
...
@@ -585,195 +343,76 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -585,195 +343,76 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
False
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
stream_options
=
{
"include_usage"
:
True
})
# Test stream=True, stream_options={"include_usage": True,
@
pytest
.
mark
.
asyncio
# "continuous_usage_stats": True}
@
pytest
.
mark
.
parametrize
(
stream
=
await
client
.
chat
.
completions
.
create
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
batch
.
choices
)
==
2
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
1
].
text
# test n = 2
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
n
=
2
,
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
assert
batch
.
choices
[
0
].
text
!=
batch
.
choices
[
1
].
text
,
"beam search should be different"
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
2
].
text
,
"two copies of the same prompt should be the same"
assert
batch
.
choices
[
1
].
text
==
batch
.
choices
[
3
].
text
,
"two copies of the same prompt should be the same"
# test streaming
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
]
,
messages
=
messages
,
max_tokens
=
5
,
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
},
)
)
texts
=
[
""
]
*
2
async
for
chunk
in
stream
:
async
for
chunk
in
batch
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
len
(
chunk
.
choices
)
==
1
assert
chunk
.
usage
.
completion_tokens
>=
0
choice
=
chunk
.
choices
[
0
]
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
texts
[
choice
.
index
]
+=
choice
.
text
chunk
.
usage
.
completion_tokens
)
assert
texts
[
0
]
==
texts
[
1
]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can both force and ban tokens.

    First a large positive bias on a single token id is applied and the
    response is compared, token by token, against that id repeated. Then an
    unbiased completion is generated, all of its tokens are given a large
    negative bias, and the new completion text must differ from the first.
    """
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Token ids of ``text`` without any special tokens added.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    actual_ids = encode(forced_text)
    # Round-trip through decode/encode so both sides are tokenized alike.
    wanted_ids = encode(tokenizer.decode([token_id] * 5))
    assert all(got == want for got, want in zip(actual_ids, wanted_ids))

    # Test ban
    baseline = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = baseline.choices[0].text
    banned_ids = encode(first_response)

    rerun = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token): -100
                    for token in banned_ids},
    )
    assert first_response != rerun.choices[0].text
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_completion
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
completion
=
await
client
.
completions
.
create
(
sample_guided_choice
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"The best language for type-safe systems programming is "
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
prompt
=
f
"Give an example JSON for an employee profile "
messages
=
messages
,
f
"that fits this schema:
{
TEST_SCHEMA
}
"
,
max_tokens
=
10
,
n
=
3
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
temperature
=
1.0
,
max_tokens
=
500
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice1
in
sample_guided_choice
assert
completion
.
id
is
not
None
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
choice1
})
assert
len
(
completion
.
choices
)
==
3
messages
.
append
({
for
i
in
range
(
3
):
"role"
:
"user"
,
output_json
=
json
.
loads
(
completion
.
choices
[
i
].
text
)
"content"
:
"I disagree, pick another one"
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
TEST_SCHEMA
)
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice2
in
sample_guided_choice
assert
choice1
!=
choice2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -782,18 +421,18 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
...
@@ -782,18 +421,18 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
"user"
,
"user"
,
"content"
:
"content"
:
f
"Give an example JSON for an employee profile that "
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
assert
message
.
content
is
not
None
json1
=
json
.
loads
(
message
.
content
)
json1
=
json
.
loads
(
message
.
content
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
sample_json_schema
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
messages
.
append
({
messages
.
append
({
...
@@ -806,41 +445,21 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
...
@@ -806,41 +445,21 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
assert
message
.
content
is
not
None
json2
=
json
.
loads
(
message
.
content
)
json2
=
json
.
loads
(
message
.
content
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
sample_json_schema
)
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str):
    """Request three regex-guided completions and verify each one.

    Every returned choice must fully match ``TEST_REGEX``, for each guided
    decoding backend supplied by the parametrization.
    """
    guided_options = dict(guided_regex=TEST_REGEX,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    # Exactly three choices are present here (asserted just above).
    for choice in completion.choices:
        assert re.fullmatch(TEST_REGEX, choice.text) is not None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
sample_regex
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -848,17 +467,17 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
...
@@ -848,17 +467,17 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
"role"
:
"role"
:
"user"
,
"user"
,
"content"
:
"content"
:
f
"Give an example IP address with this regex:
{
TEST_REGEX
}
"
f
"Give an example IP address with this regex:
{
sample_regex
}
"
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
20
,
max_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip1
is
not
None
assert
ip1
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
ip1
)
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip1
)
is
not
None
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
ip1
})
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
ip1
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"Give me a different one"
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"Give me a different one"
})
...
@@ -866,39 +485,16 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
...
@@ -866,39 +485,16 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
20
,
max_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip2
is
not
None
assert
ip2
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
ip2
)
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip2
)
is
not
None
assert
ip1
!=
ip2
assert
ip1
!=
ip2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
async
def
test_guided_decoding_type_error
(
client
:
openai
.
AsyncOpenAI
):
[
"outlines"
,
"lm-format-enforcer"
])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str):
    """Request two choice-guided completions and verify each one.

    Every returned choice text must be a member of ``TEST_CHOICE``.
    """
    guided_options = dict(guided_choice=TEST_CHOICE,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 2
    # Exactly two choices are present here (asserted just above).
    for choice in completion.choices:
        assert choice.text in TEST_CHOICE
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -908,52 +504,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
...
@@ -908,52 +504,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
"content"
:
"content"
:
"The best language for type-safe systems programming is "
"The best language for type-safe systems programming is "
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice1
in
TEST_CHOICE
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
choice1
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"I disagree, pick another one"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice2
in
TEST_CHOICE
assert
choice1
!=
choice2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_decoding_type_error
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Give an example JSON that fits this schema: 42"
,
extra_body
=
dict
(
guided_json
=
42
,
guided_decoding_backend
=
guided_decoding_backend
))
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"The best language for type-safe systems programming is "
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
_
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
...
@@ -962,18 +513,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
...
@@ -962,18 +513,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
2
:
"C++"
2
:
"C++"
}))
}))
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Give an example string that fits this regex"
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
guided_json
=
TEST_SCHEMA
))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
sample_guided_choice
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -989,7 +535,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -989,7 +535,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
max_tokens
=
10
,
max_tokens
=
10
,
logprobs
=
True
,
logprobs
=
True
,
top_logprobs
=
5
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
...
@@ -1005,7 +551,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -1005,7 +551,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -1014,7 +561,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1014,7 +561,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"user"
,
"user"
,
"content"
:
"content"
:
f
"Give an example JSON for an employee profile that "
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
}]
# non-streaming
# non-streaming
...
@@ -1028,7 +575,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1028,7 +575,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"function"
:
{
"function"
:
{
"name"
:
"dummy_function_name"
,
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}
}],
}],
tool_choice
=
{
tool_choice
=
{
...
@@ -1041,7 +588,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1041,7 +588,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
assert
len
(
message
.
content
)
==
0
assert
len
(
message
.
content
)
==
0
json_string
=
message
.
tool_calls
[
0
].
function
.
arguments
json_string
=
message
.
tool_calls
[
0
].
function
.
arguments
json1
=
json
.
loads
(
json_string
)
json1
=
json
.
loads
(
json_string
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
sample_json_schema
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
json_string
})
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
json_string
})
messages
.
append
({
messages
.
append
({
...
@@ -1062,7 +609,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1062,7 +609,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"function"
:
{
"function"
:
{
"name"
:
"dummy_function_name"
,
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}
}],
}],
tool_choice
=
{
tool_choice
=
{
...
@@ -1087,7 +634,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1087,7 +634,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
# finish reason should only return in last block
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
finish_reason_count
==
1
json2
=
json
.
loads
(
""
.
join
(
output
))
json2
=
json
.
loads
(
""
.
join
(
output
))
jsonschema
.
validate
(
instance
=
json2
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
sample_json_schema
)
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
...
@@ -1095,7 +642,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -1095,7 +642,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_required_tool_use_not_yet_supported
(
async
def
test_required_tool_use_not_yet_supported
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -1104,7 +652,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -1104,7 +652,7 @@ async def test_required_tool_use_not_yet_supported(
"user"
,
"user"
,
"content"
:
"content"
:
f
"Give an example JSON for an employee profile that "
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
...
@@ -1117,7 +665,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -1117,7 +665,7 @@ async def test_required_tool_use_not_yet_supported(
"function"
:
{
"function"
:
{
"name"
:
"dummy_function_name"
,
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}
}],
}],
tool_choice
=
"required"
)
tool_choice
=
"required"
)
...
@@ -1132,7 +680,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -1132,7 +680,7 @@ async def test_required_tool_use_not_yet_supported(
"function"
:
{
"function"
:
{
"name"
:
"dummy_function_name"
,
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}
}],
}],
tool_choice
=
"auto"
)
tool_choice
=
"auto"
)
...
@@ -1140,8 +688,9 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -1140,8 +688,9 @@ async def test_required_tool_use_not_yet_supported(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_inconsistent_tool_choice_and_tools
(
async
def
test_inconsistent_tool_choice_and_tools
(
client
:
openai
.
AsyncOpenAI
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
messages
=
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
"content"
:
"you are a helpful assistant"
...
@@ -1150,7 +699,7 @@ async def test_inconsistent_tool_choice_and_tools(
...
@@ -1150,7 +699,7 @@ async def test_inconsistent_tool_choice_and_tools(
"user"
,
"user"
,
"content"
:
"content"
:
f
"Give an example JSON for an employee profile that "
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
...
@@ -1175,7 +724,7 @@ async def test_inconsistent_tool_choice_and_tools(
...
@@ -1175,7 +724,7 @@ async def test_inconsistent_tool_choice_and_tools(
"function"
:
{
"function"
:
{
"name"
:
"dummy_function_name"
,
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}
}],
}],
tool_choice
=
{
tool_choice
=
{
...
@@ -1273,76 +822,6 @@ async def test_custom_role(client: openai.AsyncOpenAI):
...
@@ -1273,76 +822,6 @@ async def test_custom_role(client: openai.AsyncOpenAI):
assert
content1
==
content2
assert
content1
==
content2
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI):
    """Generate a completion constrained by a small SQL grammar.

    The output must be parseable by Lark with that same grammar and, once
    stripped, equal the expected statement with its spaces removed.
    """
    simple_sql_grammar = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""

    request_prompt = ("Generate a sql state that select col_1 from "
                      "table_1 where it is equals to 1")
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=request_prompt,
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_grammar=simple_sql_grammar))
    content = completion.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    # (parse() raising is what fails the test here)
    from lark import Lark
    Lark(simple_sql_grammar).parse(content)

    # remove spaces for comparison b/c we removed them in the grammar
    expected = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
    assert content.strip() == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check echoed completions that also request logprobs.

    For a text prompt and a token-id prompt, the returned text must begin
    with the prompt, and the logprob arrays must cover more than the five
    generated tokens, with the first entry being ``None``.

    Args:
        client: async OpenAI client fixture.
        model_name: served model (base model or a LoRA adapter).
        logprobs_arg: value passed as ``logprobs`` in the request.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        # Token-id prompts are decoded so they can be compared as text.
        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        # Literal prefix check: the prompt is not a regex pattern, so it must
        # not be interpreted as one (metacharacters would change the match).
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # Every entry after the first must hold between max(logprobs_arg, 1)
        # and logprobs_arg + 1 candidates.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_long_seed
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_long_seed
(
client
:
openai
.
AsyncOpenAI
):
for
seed
in
[
for
seed
in
[
...
@@ -1361,7 +840,3 @@ async def test_long_seed(client: openai.AsyncOpenAI):
...
@@ -1361,7 +840,3 @@ async def test_long_seed(client: openai.AsyncOpenAI):
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
or
"less_than_equal"
in
exc_info
.
value
.
message
)
or
"less_than_equal"
in
exc_info
.
value
.
message
)
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
tests/entrypoints/openai/test_completion.py
0 → 100644
View file @
e7c1b7f3
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
PA_NAME
=
"swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS
=
8
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Module-scoped fixture: fetch the ``LORA_NAME`` repo snapshot.

    Returns whatever ``snapshot_download`` returns for that repo
    (presumably the local snapshot directory; confirm against
    huggingface_hub).
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Module-scoped fixture: a LoRA copy whose tokenizer has extra tokens.

    Copies the downloaded LoRA files into a temporary directory, saves a
    tokenizer extended with three extra special tokens next to them, and
    yields the path of that copy.

    The temporary directory is managed by a ``with`` block so it is removed
    both on normal teardown and when any setup step (copy, tokenizer load,
    token-count assertion) fails before the value is yielded.
    """
    with TemporaryDirectory() as tmp_dir_name:
        tmp_model_dir = f"{tmp_dir_name}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
@pytest.fixture(scope="module")
def zephyr_pa_files():
    """Module-scoped fixture: fetch the ``PA_NAME`` repo snapshot.

    Returns whatever ``snapshot_download`` returns for that repo
    (presumably the local snapshot directory; confirm against
    huggingface_hub).
    """
    prompt_adapter_snapshot = snapshot_download(repo_id=PA_NAME)
    return prompt_adapter_snapshot
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
                        zephyr_pa_files):
    """Module-scoped fixture: command-line arguments for the test server.

    The list is assembled from three groups (general settings, LoRA
    settings, prompt-adapter settings) and returned in that order.
    """
    general_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    # lora config
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    # pa config: both adapter names point at the same downloaded files
    prompt_adapter_args = [
        "--enable-prompt-adapter",
        "--prompt-adapters",
        f"zephyr-pa={zephyr_pa_files}",
        f"zephyr-pa2={zephyr_pa_files}",
        "--max-prompt-adapters",
        "2",
        "--max-prompt-adapter-token",
        "128",
    ]
    return [*general_args, *lora_args, *prompt_adapter_args]
@pytest.fixture(scope="module")
def server(default_server_args):
    """Module-scoped fixture: yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``.

    The server object is used as a context manager, so its exit logic runs
    when the fixture is torn down.
    """
    launcher = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with launcher as running_server:
        yield running_server
@pytest.fixture(scope="module")
def client(server):
    """Module-scoped fixture: the async client handed out by ``server``."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name,num_virtual_tokens",
    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                 num_virtual_tokens: int):
    """Run one text-prompt and one token-id-prompt completion.

    For the text prompt the test checks the id, the single choice, the
    ``"length"`` finish reason and the exact usage counts, where the prompt
    count is increased by ``num_virtual_tokens``.
    """
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)

    assert text_completion.id is not None
    choices = text_completion.choices
    assert choices is not None and len(choices) == 1

    only_choice = choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    # 6 prompt tokens + 5 generated tokens, plus any virtual tokens.
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=5,
        prompt_tokens=6 + num_virtual_tokens,
        total_tokens=11 + num_virtual_tokens)
    assert text_completion.usage == expected_usage

    # test using token IDs
    id_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(id_completion.choices[0].text) >= 1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    """Echo a token-ID prompt through ``zephyr-lora2`` and check the prefix."""
    # test using token IDs
    echoed = await client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should appear in tokenized prompt
    assert echoed_text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    """Echo the same token-ID prompt through the base model.

    The echoed text must not contain the ``vllm`` marker.
    """
    # test using token IDs
    echoed = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should not appear in tokenized prompt
    assert "vllm" not in echoed_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the returned choice carries no logprobs."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora and 1 pa hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` logprobs are present and the first top entry has
    exactly one item."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    leading_entry = first_choice.logprobs.top_logprobs[0]
    assert len(leading_entry) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` the first top entry holds five or six items."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    leading_entry = first_choice.logprobs.top_logprobs[0]
    assert 5 <= len(leading_entry) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Requests asking for too many logprobs must raise, in both the
    non-streaming and streaming paths, and a normal request must still
    succeed afterwards."""
    rejected = (openai.BadRequestError, openai.APIError)

    # vLLM has higher default max_logprobs (20 instead of 5) to support
    # both Completion API and Chat Completion API
    with pytest.raises(rejected):
        # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=21,
        )

    with pytest.raises(rejected):
        # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=30,
            stream=True,
        )
        # drain the stream so an error raised during iteration is seen too
        async for _chunk in stream:
            pass

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(follow_up.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """The concatenated streamed chunks must equal the non-streamed text,
    with exactly one finish reason reported."""
    question = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=question,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = reference.choices[0].text

    stream = await client.completions.create(
        model=model_name,
        prompt=question,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    pieces: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        pieces.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # ``chunk`` still refers to the last streamed chunk here
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    streamed_output = "".join(pieces)
    assert streamed_output == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise ``stream_options`` for streaming and non-streaming requests.

    Streaming: all four combinations of ``include_usage`` and
    ``continuous_usage_stats`` are checked for where usage appears.
    Non-streaming: any ``stream_options`` must raise ``BadRequestError``.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage, continuous_usage_stats):
        # One streaming request with the given usage options.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            })

    def check_trailing_chunk(final_chunk):
        # The chunk after the finish reason carries usage and no choices.
        assert final_chunk.usage is not None
        assert final_chunk.usage.prompt_tokens > 0
        assert final_chunk.usage.completion_tokens > 0
        assert final_chunk.usage.total_tokens == (
            final_chunk.usage.prompt_tokens +
            final_chunk.usage.completion_tokens)
        assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    # and then
    #     {"include_usage": False, "continuous_usage_stats": True}
    for continuous in (False, True):
        stream = await open_stream(False, continuous)
        async for chunk in stream:
            assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        finished = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if finished:
            check_trailing_chunk(await stream.__anext__())

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            check_trailing_chunk(await stream.__anext__())

    # Test stream=False with, in order, stream_options=
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    invalid_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for options in invalid_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(model=model_name,
                                            prompt=prompt,
                                            max_tokens=5,
                                            temperature=0.0,
                                            stream=False,
                                            stream_options=options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts in one request, as text and as token IDs.

    Covers a plain batch, ``n=2`` with beam search, and a streamed batch.
    """
    text_batch = ["Hello, my name is"] * 2
    ids_batch = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_batch, ids_batch):
        # test simple list
        plain = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain.choices) == 2
        assert plain.choices[0].text == plain.choices[1].text

        # test n = 2
        beamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
                # for official client.
                use_beam_search=True),
        )
        assert len(beamed.choices) == 4
        beam_texts = [candidate.text for candidate in beamed.choices]
        assert beam_texts[0] != beam_texts[
            1], "beam search should be different"
        assert beam_texts[0] == beam_texts[
            2], "two copies of the same prompt should be the same"
        assert beam_texts[1] == beam_texts[
            3], "two copies of the same prompt should be the same"

        # test streaming
        streamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        # accumulate the streamed text per prompt index
        texts = ["", ""]
        async for chunk in streamed:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            texts[piece.index] += piece.text
        assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check ``logit_bias`` both for forcing one token and for banning tokens."""
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Token IDs of ``text`` without special tokens.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = encode(forced_text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    assert all(got == want
               for got, want in zip(response_tokens, expected_tokens))

    # Test ban
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = unbiased.choices[0].text
    response_tokens = encode(first_response)
    # penalise every token of the unbiased answer
    ban_bias = {str(token): -100 for token in response_tokens}
    banned = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=ban_bias,
    )
    assert first_response != banned.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    """With ``allowed_token_ids`` set, the single generated token must be
    one of the allowed IDs."""
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=1,
        temperature=0.0,
        seed=42,
        extra_body=dict(allowed_token_ids=allowed_ids),
        logprobs=1,
    )
    response_tokens = result.choices[0].logprobs.tokens
    assert len(response_tokens) == 1
    produced_ids = tokenizer.convert_tokens_to_ids(response_tokens)
    assert produced_ids[0] in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Three sampled completions guided by a JSON schema must each parse as
    JSON and validate against that schema."""
    guided_options = dict(guided_json=sample_json_schema,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for candidate in result.choices:
        parsed = json.loads(candidate.text)
        jsonschema.validate(instance=parsed, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Three sampled completions guided by a regex must each fully match it."""
    guided_options = dict(guided_regex=sample_regex,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for candidate in result.choices:
        matched = re.fullmatch(sample_regex, candidate.text)
        assert matched is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Two sampled completions guided by a choice list must each be one of
    the listed choices."""
    guided_options = dict(guided_choice=sample_guided_choice,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 2
    for candidate in result.choices:
        assert candidate.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """A completion guided by the SQL grammar must parse with Lark and equal
    the expected statement with spaces removed."""
    from lark import Lark

    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=("Generate a sql state that select col_1 from "
                "table_1 where it is equals to 1"),
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_grammar=sample_sql_statements))

    content = result.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    sql_parser = Lark(sample_sql_statements)
    sql_parser.parse(content)

    # remove spaces for comparison b/c we removed them in the grammar
    expected_sql = "SELECT col_1 from table_1 where col_1 = 1"
    ground_truth = expected_sql.replace(" ", "")

    assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check ``echo=True`` together with ``logprobs`` for text and token-ID
    prompts.

    The returned text must start with the prompt, and the logprob lists must
    cover more than five entries, with the first entry being ``None``.

    Args:
        client: Async OpenAI client from the ``client`` fixture.
        model_name: Model or adapter name to send the request to.
        logprobs_arg: Value passed as ``logprobs``; also bounds the size of
            each top-logprobs entry after the first.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        prompt_text = (tokenizer.decode(prompt)
                       if isinstance(prompt, list) else prompt)
        # Literal prefix check: the prompt is plain text, not a regex
        # pattern, so it must not be spliced unescaped into one.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # every entry after the first holds between max(logprobs_arg, 1)
        # and logprobs_arg + 1 items
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Invalid guided-decoding arguments must raise ``BadRequestError``.

    Two requests are tried: ``guided_json=42``, and ``guided_regex`` combined
    with ``guided_json``.
    """
    wrong_type_body = dict(guided_json=42,
                           guided_decoding_backend=guided_decoding_backend)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_body)

    both_guides_body = dict(guided_regex=sample_regex,
                            guided_json=sample_json_schema)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=both_guides_body)
tests/entrypoints/openai/test_disable_mp.py
0 → 100644
View file @
e7c1b7f3
"""
Repeat of the tests in test_completion.py, run against a server started with
--disable-frontend-multiprocessing (the non-multiprocessing frontend).
"""
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# Base model passed to RemoteOpenAIServer and to the tokenizer loaders.
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# Hub repositories fetched with snapshot_download by the adapter fixtures.
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
# Added to the expected prompt/total token counts in test_single_completion
# for the prompt-adapter models.
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Return the result of ``snapshot_download`` for ``LORA_NAME``."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Yield a copy of the LoRA files whose tokenizer has three extra tokens.

    The adapter files are copied into a temporary directory, the tokenizer
    of ``MODEL_NAME`` is extended with ``vllm1``/``vllm2``/``vllm3`` and
    saved next to them.

    Args:
        zephyr_lora_files: Path of the downloaded LoRA adapter files.

    Yields:
        Path of the temporary adapter directory.
    """
    # The context manager removes the directory on teardown and also when
    # setup raises before the yield (download failure, failed assertion).
    with TemporaryDirectory() as tmp_root:
        tmp_model_dir = f"{tmp_root}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
@pytest.fixture(scope="module")
def zephyr_pa_files():
    """Return the result of ``snapshot_download`` for ``PA_NAME``."""
    pa_snapshot = snapshot_download(repo_id=PA_NAME)
    return pa_snapshot
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
                        zephyr_pa_files):
    """Build the command-line argument list used to start the test server."""
    # use half precision for speed and memory savings in CI environment
    engine_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    # lora config
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    # pa config
    prompt_adapter_args = [
        "--enable-prompt-adapter",
        "--prompt-adapters",
        f"zephyr-pa={zephyr_pa_files}",
        f"zephyr-pa2={zephyr_pa_files}",
        "--max-prompt-adapters",
        "2",
        "--max-prompt-adapter-token",
        "128",
    ]
    return [
        *engine_args,
        *lora_args,
        *prompt_adapter_args,
        "--disable-frontend-multiprocessing",
    ]
@pytest.fixture(scope="module")
def server(default_server_args):
    """Launch ``RemoteOpenAIServer`` for ``MODEL_NAME`` once per module.

    The server object is used as a context manager, so it is exited after
    the last test of the module has run.
    """
    launcher = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with launcher as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name,num_virtual_tokens",
    [
        (MODEL_NAME, 0),
        ("zephyr-lora", 0),
        ("zephyr-lora2", 0),
        ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
        ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS),
    ],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                 num_virtual_tokens: int):
    """Request one greedy completion and check its shape and token usage.

    ``num_virtual_tokens`` is added to the expected prompt and total token
    counts. A second request repeats the call with a token-ID prompt.
    """
    text_result = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    assert text_result.id is not None
    assert text_result.choices is not None and len(text_result.choices) == 1

    only_choice = text_result.choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(
        completion_tokens=5,
        prompt_tokens=6 + num_virtual_tokens,
        total_tokens=11 + num_virtual_tokens,
    )
    assert text_result.usage == expected_usage

    # test using token IDs
    ids_result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    generated = ids_result.choices[0].text
    assert len(generated) >= 1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    """Echo a token-ID prompt through ``zephyr-lora2`` and check the prefix."""
    # test using token IDs
    echoed = await client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should appear in tokenized prompt
    assert echoed_text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    """Echo the same token-ID prompt through the base model.

    The echoed text must not contain the ``vllm`` marker.
    """
    # test using token IDs
    echoed = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should not appear in tokenized prompt
    assert "vllm" not in echoed_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the returned choice carries no logprobs."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora and 1 pa hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` logprobs are present and the first top entry has
    exactly one item."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    leading_entry = first_choice.logprobs.top_logprobs[0]
    assert len(leading_entry) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` the first top entry holds five or six items."""
    # test using token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is not None
    assert first_choice.logprobs.token_logprobs is not None
    assert first_choice.logprobs.top_logprobs is not None
    leading_entry = first_choice.logprobs.top_logprobs[0]
    assert 5 <= len(leading_entry) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Requests asking for too many logprobs must raise, in both the
    non-streaming and streaming paths, and a normal request must still
    succeed afterwards."""
    rejected = (openai.BadRequestError, openai.APIError)

    # vLLM has higher default max_logprobs (20 instead of 5) to support
    # both Completion API and Chat Completion API
    with pytest.raises(rejected):
        # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=21,
        )

    with pytest.raises(rejected):
        # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            logprobs=30,
            stream=True,
        )
        # drain the stream so an error raised during iteration is seen too
        async for _chunk in stream:
            pass

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(follow_up.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """The concatenated streamed chunks must equal the non-streamed text,
    with exactly one finish reason reported."""
    question = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=question,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = reference.choices[0].text

    stream = await client.completions.create(
        model=model_name,
        prompt=question,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    pieces: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        pieces.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # ``chunk`` still refers to the last streamed chunk here
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    streamed_output = "".join(pieces)
    assert streamed_output == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise ``stream_options`` for streaming and non-streaming requests.

    Streaming: all four combinations of ``include_usage`` and
    ``continuous_usage_stats`` are checked for where usage appears.
    Non-streaming: any ``stream_options`` must raise ``BadRequestError``.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage, continuous_usage_stats):
        # One streaming request with the given usage options.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            })

    def check_trailing_chunk(final_chunk):
        # The chunk after the finish reason carries usage and no choices.
        assert final_chunk.usage is not None
        assert final_chunk.usage.prompt_tokens > 0
        assert final_chunk.usage.completion_tokens > 0
        assert final_chunk.usage.total_tokens == (
            final_chunk.usage.prompt_tokens +
            final_chunk.usage.completion_tokens)
        assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    # and then
    #     {"include_usage": False, "continuous_usage_stats": True}
    for continuous in (False, True):
        stream = await open_stream(False, continuous)
        async for chunk in stream:
            assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        finished = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if finished:
            check_trailing_chunk(await stream.__anext__())

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            check_trailing_chunk(await stream.__anext__())

    # Test stream=False with, in order, stream_options=
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    invalid_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for options in invalid_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(model=model_name,
                                            prompt=prompt,
                                            max_tokens=5,
                                            temperature=0.0,
                                            stream=False,
                                            stream_options=options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts in one request, as text and as token IDs.

    Covers a plain batch, ``n=2`` with beam search, and a streamed batch.
    """
    text_batch = ["Hello, my name is"] * 2
    ids_batch = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_batch, ids_batch):
        # test simple list
        plain = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain.choices) == 2
        assert plain.choices[0].text == plain.choices[1].text

        # test n = 2
        beamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
                # NOTE: this has to be true for n > 1 in vLLM, but not necessary
                # for official client.
                use_beam_search=True),
        )
        assert len(beamed.choices) == 4
        beam_texts = [candidate.text for candidate in beamed.choices]
        assert beam_texts[0] != beam_texts[
            1], "beam search should be different"
        assert beam_texts[0] == beam_texts[
            2], "two copies of the same prompt should be the same"
        assert beam_texts[1] == beam_texts[
            3], "two copies of the same prompt should be the same"

        # test streaming
        streamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        # accumulate the streamed text per prompt index
        texts = ["", ""]
        async for chunk in streamed:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            texts[piece.index] += piece.text
        assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can both force and ban tokens.

    A large positive bias on one token ID should make the output consist of
    that token; a large negative bias on every token of a baseline answer
    should make the next answer differ from the baseline.
    """
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Tokenize without BOS/EOS so only the text's own tokens remain.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = encode(forced_text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    # Compare pairwise over the common prefix of the two token sequences.
    for response, expected in zip(response_tokens, expected_tokens):
        assert response == expected

    # Test ban
    baseline = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = baseline.choices[0].text
    banned_bias = {str(token): -100 for token in encode(first_response)}
    rerun = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=banned_bias,
    )
    assert first_response != rerun.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    """Check that ``allowed_token_ids`` restricts the sampled token.

    A single token is generated with logprobs enabled, and its ID must be
    one of the allowed IDs.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=1,
        temperature=0.0,
        seed=42,
        extra_body={"allowed_token_ids": allowed_ids},
        logprobs=1,
    )

    sampled_tokens = result.choices[0].logprobs.tokens
    assert len(sampled_tokens) == 1
    sampled_ids = tokenizer.convert_tokens_to_ids(sampled_tokens)
    assert sampled_ids[0] in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Check that ``guided_json`` output parses and validates against the schema.

    Three samples are requested from the given guided-decoding backend;
    each must be valid JSON that satisfies ``sample_json_schema``.
    """
    guided_options = {
        "guided_json": sample_json_schema,
        "guided_decoding_backend": guided_decoding_backend,
    }
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for choice in result.choices:
        parsed = json.loads(choice.text)
        jsonschema.validate(instance=parsed, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Check that ``guided_regex`` output fully matches the regex.

    Three samples are requested from the given guided-decoding backend;
    each one must match ``sample_regex`` from start to end.
    """
    guided_options = {
        "guided_regex": sample_regex,
        "guided_decoding_backend": guided_decoding_backend,
    }
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for choice in result.choices:
        assert re.fullmatch(sample_regex, choice.text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Check that ``guided_choice`` output is one of the offered choices.

    Two samples are requested from the given guided-decoding backend; the
    text of each must be a member of ``sample_guided_choice``.
    """
    guided_options = {
        "guided_choice": sample_guided_choice,
        "guided_decoding_backend": guided_decoding_backend,
    }
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 2
    for choice in result.choices:
        assert choice.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Check that ``guided_grammar`` output parses and equals the expected SQL.

    The grammar fixture is passed to the server for guided decoding and is
    reused locally with Lark to parse the generated text.
    """
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=("Generate a sql state that select col_1 from "
                "table_1 where it is equals to 1"),
        temperature=1.0,
        max_tokens=500,
        extra_body={"guided_grammar": sample_sql_statements})

    generated = result.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark
    Lark(sample_sql_statements).parse(generated)

    # remove spaces for comparison b/c we removed them in the grammar
    expected_sql = "".join(
        "SELECT col_1 from table_1 where col_1 = 1".split(" "))

    assert generated.strip() == expected_sql
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check echoed completions with logprobs for text and token-ID prompts.

    With ``echo=True`` the returned text must begin with the prompt, and the
    logprob arrays must cover more than the 5 generated tokens.

    Args:
        client: Async OpenAI client fixture.
        model_name: Served model name, either the base model or a LoRA.
        logprobs_arg: Value passed as ``logprobs`` in the request.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        # Token-ID prompts are decoded so they can be compared with the text.
        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        # Literal prefix check: the prompt is plain text, not a regex, so it
        # must not be spliced into a pattern where metacharacters such as
        # "." or "(" would change the match.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        # The first echoed position is expected to carry no logprob entry.
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # NOTE(review): the upper bound of logprobs_arg + 1 presumably allows
        # for the actual token being reported in addition to the top-k
        # entries -- confirm against the server implementation.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Check that malformed guided-decoding requests raise BadRequestError.

    Two requests are sent: one whose ``guided_json`` is the integer 42, and
    one that sets ``guided_regex`` and ``guided_json`` together.
    """
    wrong_type_options = {
        "guided_json": 42,
        "guided_decoding_backend": guided_decoding_backend,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options)

    conflicting_options = {
        "guided_regex": sample_regex,
        "guided_json": sample_json_schema,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options)
tests/entrypoints/
test_
openai_embedding.py
→
tests/entrypoints/openai
/test
_embedding.py
View file @
e7c1b7f3
import
base64
import
numpy
as
np
import
openai
import
openai
import
pytest
import
pytest
import
ray
from
..utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
..
.
utils
import
RemoteOpenAIServer
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
pytestmark
=
pytest
.
mark
.
openai
@
pytest
.
fixture
(
scope
=
"module"
)
def
ray_ctx
():
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
})
yield
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
(
ray_ctx
):
def
embedding_server
():
return
RemoteOpenAIServer
([
args
=
[
"--model"
,
EMBEDDING_MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"8192"
,
"8192"
,
"--enforce-eager"
,
]
])
with
RemoteOpenAIServer
(
EMBEDDING_MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -111,3 +104,33 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
...
@@ -111,3 +104,33 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check that base64-encoded embeddings decode to the float embeddings.

    The same two inputs are embedded once with ``encoding_format="float"``
    and once with ``"base64"``; the decoded base64 payloads must equal the
    float vectors.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    as_float = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float")
    as_base64 = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64")

    # Decode each base64 payload back into a plain list of Python floats.
    decoded = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float").tolist() for item in as_base64.data
    ]

    for position in (0, 1):
        assert as_float.data[position].embedding == decoded[position]
tests/entrypoints/test_guided_processors.py
→
tests/entrypoints/
openai/
test_guided_processors.py
View file @
e7c1b7f3
...
@@ -10,61 +10,17 @@ from vllm.model_executor.guided_decoding import (
...
@@ -10,61 +10,17 @@ from vllm.model_executor.guided_decoding import (
from
vllm.model_executor.guided_decoding.outlines_logits_processors
import
(
from
vllm.model_executor.guided_decoding.outlines_logits_processors
import
(
JSONLogitsProcessor
,
RegexLogitsProcessor
)
JSONLogitsProcessor
,
RegexLogitsProcessor
)
TEST_SCHEMA
=
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"string"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work history"
]
}
TEST_REGEX
=
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
def
test_guided_logits_processors
(
sample_regex
,
sample_json_schema
):
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
pytestmark
=
pytest
.
mark
.
openai
def
test_guided_logits_processors
():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
regex_LP
=
RegexLogitsProcessor
(
TEST_REGEX
,
tokenizer
)
regex_LP
=
RegexLogitsProcessor
(
sample_regex
,
tokenizer
)
json_LP
=
JSONLogitsProcessor
(
TEST_SCHEMA
,
json_LP
=
JSONLogitsProcessor
(
sample_json_schema
,
tokenizer
,
tokenizer
,
whitespace_pattern
=
None
)
whitespace_pattern
=
None
)
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
)
tensor
=
torch
.
rand
(
32000
)
tensor
=
torch
.
rand
(
32000
)
original_tensor
=
torch
.
clone
(
tensor
)
original_tensor
=
torch
.
clone
(
tensor
)
regex_LP
(
token_ids
,
tensor
)
regex_LP
(
token_ids
,
tensor
)
...
@@ -72,7 +28,8 @@ def test_guided_logits_processors():
...
@@ -72,7 +28,8 @@ def test_guided_logits_processors():
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
f
"Give an employee profile that fits this schema:
{
sample_json_schema
}
"
)
tensor
=
torch
.
rand
(
32000
)
tensor
=
torch
.
rand
(
32000
)
original_tensor
=
torch
.
clone
(
tensor
)
original_tensor
=
torch
.
clone
(
tensor
)
json_LP
(
token_ids
,
tensor
)
json_LP
(
token_ids
,
tensor
)
...
@@ -82,13 +39,14 @@ def test_guided_logits_processors():
...
@@ -82,13 +39,14 @@ def test_guided_logits_processors():
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_logits_processor_black_box
(
backend
:
str
):
async
def
test_guided_logits_processor_black_box
(
backend
:
str
,
sample_regex
,
sample_json_schema
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
)
regex_request
=
CompletionRequest
(
model
=
'test'
,
regex_request
=
CompletionRequest
(
model
=
'test'
,
prompt
=
token_ids
,
prompt
=
token_ids
,
guided_regex
=
TEST_REGEX
)
guided_regex
=
sample_regex
)
regex_lp
=
await
get_guided_decoding_logits_processor
(
regex_lp
=
await
get_guided_decoding_logits_processor
(
backend
,
regex_request
,
tokenizer
)
backend
,
regex_request
,
tokenizer
)
assert
regex_lp
is
not
None
assert
regex_lp
is
not
None
...
@@ -99,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
...
@@ -99,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
f
"Give an employee profile that fits this schema:
{
sample_json_schema
}
"
)
json_request
=
CompletionRequest
(
model
=
'test'
,
json_request
=
CompletionRequest
(
model
=
'test'
,
prompt
=
token_ids
,
prompt
=
token_ids
,
guided_json
=
TEST_SCHEMA
)
guided_json
=
sample_json_schema
)
json_lp
=
await
get_guided_decoding_logits_processor
(
json_lp
=
await
get_guided_decoding_logits_processor
(
backend
,
json_request
,
tokenizer
)
backend
,
json_request
,
tokenizer
)
assert
json_lp
is
not
None
assert
json_lp
is
not
None
...
...
tests/entrypoints/openai/test_models.py
0 → 100644
View file @
e7c1b7f3
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Return the result of ``snapshot_download`` for the ``LORA_NAME`` repo."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    """Yield a RemoteOpenAIServer for ``MODEL_NAME`` with two LoRA modules.

    Both LoRA module names point at the same downloaded adapter files.
    """
    engine_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(MODEL_NAME,
                            engine_args + lora_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """Check the model listing: base model first, then the two LoRA names.

    Every listed entry must report ``MODEL_NAME`` as its ``root``.
    """
    listing = await client.models.list()
    entries = listing.data

    base_entry = entries[0]
    lora_entries = entries[1:]

    assert base_entry.id == MODEL_NAME
    assert all(entry.root == MODEL_NAME for entry in entries)
    assert lora_entries[0].id == "zephyr-lora"
    assert lora_entries[1].id == "zephyr-lora2"
tests/entrypoints/
test_server
_oot_registration.py
→
tests/entrypoints/
openai/test
_oot_registration.py
View file @
e7c1b7f3
import
sys
import
sys
import
time
import
time
import
pytest
import
torch
import
torch
from
openai
import
OpenAI
,
OpenAIError
from
openai
import
OpenAI
,
OpenAIError
...
@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM
...
@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.utils
import
get_open_port
from
vllm.utils
import
get_open_port
pytestmark
=
pytest
.
mark
.
openai
class
MyOPTForCausalLM
(
OPTForCausalLM
):
class
MyOPTForCausalLM
(
OPTForCausalLM
):
...
@@ -39,10 +36,12 @@ def test_oot_registration_for_api_server():
...
@@ -39,10 +36,12 @@ def test_oot_registration_for_api_server():
ctx
=
torch
.
multiprocessing
.
get_context
()
ctx
=
torch
.
multiprocessing
.
get_context
()
server
=
ctx
.
Process
(
target
=
server_function
,
args
=
(
port
,
))
server
=
ctx
.
Process
(
target
=
server_function
,
args
=
(
port
,
))
server
.
start
()
server
.
start
()
MAX_SERVER_START_WAIT_S
=
60
client
=
OpenAI
(
client
=
OpenAI
(
base_url
=
f
"http://localhost:
{
port
}
/v1"
,
base_url
=
f
"http://localhost:
{
port
}
/v1"
,
api_key
=
"token-abc123"
,
api_key
=
"token-abc123"
,
)
)
now
=
time
.
time
()
while
True
:
while
True
:
try
:
try
:
completion
=
client
.
chat
.
completions
.
create
(
completion
=
client
.
chat
.
completions
.
create
(
...
@@ -60,6 +59,8 @@ def test_oot_registration_for_api_server():
...
@@ -60,6 +59,8 @@ def test_oot_registration_for_api_server():
except
OpenAIError
as
e
:
except
OpenAIError
as
e
:
if
"Connection error"
in
str
(
e
):
if
"Connection error"
in
str
(
e
):
time
.
sleep
(
3
)
time
.
sleep
(
3
)
if
time
.
time
()
-
now
>
MAX_SERVER_START_WAIT_S
:
raise
RuntimeError
(
"Server did not start in time"
)
from
e
else
:
else
:
raise
e
raise
e
server
.
kill
()
server
.
kill
()
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
0 → 100644
View file @
e7c1b7f3
# Separate these tests out from test_completion and test_chat, because they
# require launching a second server with a different flag. Running both servers
# at the same time on a single node will OOM.
import
pytest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
default_server_args
# noqa: F401
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
from
.test_completion
import
zephyr_pa_files
# noqa: F401
from
.test_completion
import
MODEL_NAME
@pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag(
        default_server_args):  # noqa: F811
    """Yield a server started with ``--return-tokens-as-token-ids``.

    The flag is appended to a copy of the default server arguments.
    """
    flagged_args = default_server_args + ["--return-tokens-as-token-ids"]
    with RemoteOpenAIServer(MODEL_NAME, flagged_args) as flagged_server:
        yield flagged_server
@pytest.mark.asyncio
async def test_completion_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
    """Check ``token_id:<id>`` token strings returned by the completions API.

    The token strings must agree with the keys of ``top_logprobs``, and
    decoding the IDs they carry must reproduce the returned text.
    """
    client = server_with_return_tokens_as_token_ids_flag.get_async_client()

    result = await client.completions.create(
        model=MODEL_NAME,
        # Include Unicode characters to test for dividing a single
        # character across multiple tokens: 🎉 is [28705, 31862] for the
        # Zephyr tokenizer
        prompt="Say 'Hello, world! 🎉'",
        echo=True,
        temperature=0,
        max_tokens=10,
        logprobs=1)

    choice = result.choices[0]
    text = choice.text
    token_strs = choice.logprobs.tokens
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Check that the token representations are consistent between raw tokens
    # and top_logprobs
    # Slice off the first one, because there's no scoring associated with BOS
    first_keys = []
    for logprob_by_tokens in choice.logprobs.top_logprobs[1:]:
        first_keys.append(next(iter(logprob_by_tokens)))
    assert token_strs[1:] == first_keys

    # Check that decoding the tokens gives the expected text
    token_ids = [
        int(token_str.removeprefix("token_id:")) for token_str in token_strs
    ]
    assert text == tokenizer.decode(token_ids, skip_special_tokens=True)
@pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
    """Check ``token_id:<id>`` token strings returned by the chat API.

    Decoding the IDs taken from the logprob entries must reproduce the
    message content.
    """
    client = server_with_return_tokens_as_token_ids_flag.get_async_client()

    # Include Unicode characters to test for dividing a single
    # character across multiple tokens: 🎉 is [28705, 31862] for the
    # Zephyr tokenizer
    conversation = [{
        "role": "system",
        "content": "You like to respond in only emojis, like 🎉"
    }, {
        "role": "user",
        "content": "Please write some emojis: 🐱🐶🎉"
    }]
    response = await client.chat.completions.create(model=MODEL_NAME,
                                                    messages=conversation,
                                                    temperature=0,
                                                    max_tokens=8,
                                                    logprobs=True)

    choice = response.choices[0]
    text = choice.message.content
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Strip the "token_id:" prefix to recover the integer IDs.
    token_ids = [
        int(entry.token.removeprefix("token_id:"))
        for entry in choice.logprobs.content
    ]

    assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
tests/entrypoints/
test_
openai_run_batch.py
→
tests/entrypoints/openai
/test
_run_batch.py
View file @
e7c1b7f3
...
@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
...
@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501
# ruff: noqa: E501
INPUT_BATCH
=
"""{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
INPUT_BATCH
=
"""{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH
=
"""{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
INVALID_INPUT_BATCH
=
"""{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment