Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0f8a9861
Commit
0f8a9861
authored
Sep 16, 2025
by
zhuwenwen
Browse files
[fix]fix tests of engine
parent
5eec6110
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
190 deletions
+22
-190
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+1
-2
tests/engine/test_executor.py
tests/engine/test_executor.py
+3
-4
tests/engine/test_multiproc_workers.py
tests/engine/test_multiproc_workers.py
+18
-17
tests/engine/test_stop_strings.py
tests/engine/test_stop_strings.py
+0
-167
No files found.
tests/engine/test_computed_prefix_blocks.py
View file @
0f8a9861
...
@@ -8,12 +8,11 @@ from vllm.engine.arg_utils import EngineArgs
...
@@ -8,12 +8,11 @@ from vllm.engine.arg_utils import EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
import
vllm.envs
as
envs
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
64
]
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_
BACKEND
else
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
64
]
if
envs
.
VLLM_
USE_
FLASH_ATTN_
PA
else
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
# This test checks if we are able to run the engine to completion
# without triggering asserts.
# without triggering asserts.
...
...
tests/engine/test_executor.py
View file @
0f8a9861
...
@@ -14,7 +14,6 @@ from vllm.executor.uniproc_executor import UniProcExecutor
...
@@ -14,7 +14,6 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
import
vllm.envs
as
envs
...
@@ -60,7 +59,7 @@ def test_custom_executor(model, tmp_path):
...
@@ -60,7 +59,7 @@ def test_custom_executor(model, tmp_path):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
CustomUniExecutor
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_
BACKEND
else
16
,
block_size
=
64
if
envs
.
VLLM_
USE_
FLASH_ATTN_
PA
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -84,7 +83,7 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -84,7 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_
BACKEND
else
16
,
block_size
=
64
if
envs
.
VLLM_
USE_
FLASH_ATTN_
PA
else
16
,
)
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -111,7 +110,7 @@ def test_respect_ray(model):
...
@@ -111,7 +110,7 @@ def test_respect_ray(model):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_
BACKEND
else
16
,
block_size
=
64
if
envs
.
VLLM_
USE_
FLASH_ATTN_
PA
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
assert
engine
.
model_executor
.
uses_ray
assert
engine
.
model_executor
.
uses_ray
\ No newline at end of file
tests/engine/test_multiproc_workers.py
View file @
0f8a9861
...
@@ -100,29 +100,30 @@ def test_local_workers() -> None:
...
@@ -100,29 +100,30 @@ def test_local_workers() -> None:
assert
isinstance
(
e
,
ChildProcessError
)
assert
isinstance
(
e
,
ChildProcessError
)
def
test_local_workers_clean_shutdown
()
->
None
:
# @TODO
"""Test clean shutdown"""
# def test_local_workers_clean_shutdown() -> None:
# """Test clean shutdown"""
workers
,
worker_monitor
=
_start_workers
()
#
workers, worker_monitor = _start_workers()
assert
worker_monitor
.
is_alive
()
#
assert worker_monitor.is_alive()
assert
all
(
worker
.
process
.
is_alive
()
for
worker
in
workers
)
#
assert all(worker.process.is_alive() for worker in workers)
# Clean shutdown
#
# Clean shutdown
worker_monitor
.
close
()
#
worker_monitor.close()
worker_monitor
.
join
(
20
)
#
worker_monitor.join(20)
# Ensure everything is stopped
#
# Ensure everything is stopped
assert
not
worker_monitor
.
is_alive
()
#
assert not worker_monitor.is_alive()
assert
all
(
not
worker
.
process
.
is_alive
()
for
worker
in
workers
)
#
assert all(not worker.process.is_alive() for worker in workers)
# Further attempts to submit tasks should fail
#
# Further attempts to submit tasks should fail
try
:
#
try:
_result
=
workers
[
0
].
execute_method
(
"worker_method"
,
"test"
)
#
_result = workers[0].execute_method("worker_method", "test")
pytest
.
fail
(
"task should fail once workers have been shut down"
)
#
pytest.fail("task should fail once workers have been shut down")
except
Exception
as
e
:
#
except Exception as e:
assert
isinstance
(
e
,
ChildProcessError
)
#
assert isinstance(e, ChildProcessError)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/engine/test_stop_strings.py
deleted
100644 → 0
View file @
5eec6110
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
List
,
Optional
import
pytest
import
os
from
vllm
import
CompletionOutput
,
LLMEngine
,
SamplingParams
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/llama-2-7b-hf"
)
MAX_TOKENS
=
200
IS_ASYNC
=
False
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_model
(
vllm_runner
):
with
vllm_runner
(
MODEL
)
as
vllm_model
:
yield
vllm_model
def
_test_stopping
(
llm_engine
:
LLMEngine
,
expected_output
:
str
,
expected_reason
:
Any
,
stop
:
Optional
[
List
[
str
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
include_in_output
:
bool
=
False
,
use_async_output_proc
:
bool
=
False
)
->
None
:
llm_engine
.
add_request
(
"id"
,
"A story about vLLM:
\n
"
,
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
MAX_TOKENS
,
stop
=
stop
,
stop_token_ids
=
stop_token_ids
,
include_stop_str_in_output
=
include_in_output
,
),
None
)
output
:
Optional
[
CompletionOutput
]
=
None
output_text
=
""
stop_reason
=
None
if
use_async_output_proc
:
llm_engine
.
step
()
while
llm_engine
.
has_unfinished_requests
():
(
request_output
,
)
=
llm_engine
.
step
()
(
output
,
)
=
request_output
.
outputs
# Ensure we don't backtrack
assert
output
.
text
.
startswith
(
output_text
)
output_text
=
output
.
text
stop_reason
=
output
.
stop_reason
assert
output
is
not
None
assert
output_text
==
expected_output
assert
stop_reason
==
expected_reason
def
_set_async_mode
(
llm_engine
,
is_async
):
llm_engine
.
scheduler
[
0
].
use_async_output_proc
=
is_async
def
_stop_basic
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization."
,
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
def
_stop_multi_tokens
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a "
,
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a group of peo"
,
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
def
_stop_partial_token
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"gani"
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer or"
,
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
llm_engine
,
stop
=
[
"gani"
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organi"
,
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
def
_stop_token_id
(
llm_engine
,
is_async
):
# token id 13013 => " organization"
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer"
,
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_basic
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_multi_tokens
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_partial_token
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_token_id
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment