Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2e3bfb1e
Commit
2e3bfb1e
authored
Jun 04, 2025
by
zhuwenwen
Browse files
[tests] update v1 tests
parent
87d06573
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
67 additions
and
46 deletions
+67
-46
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+3
-3
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+5
-5
tests/v1/core/__init__.py
tests/v1/core/__init__.py
+0
-0
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+4
-2
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+9
-7
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+3
-2
tests/v1/shutdown/__init__.py
tests/v1/shutdown/__init__.py
+0
-0
tests/v1/shutdown/test_delete.py
tests/v1/shutdown/test_delete.py
+4
-2
tests/v1/shutdown/test_forward_error.py
tests/v1/shutdown/test_forward_error.py
+4
-2
tests/v1/shutdown/test_processor_error.py
tests/v1/shutdown/test_processor_error.py
+4
-2
tests/v1/shutdown/test_startup_error.py
tests/v1/shutdown/test_startup_error.py
+5
-3
tests/v1/spec_decode/__init__.py
tests/v1/spec_decode/__init__.py
+0
-0
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+5
-3
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+5
-3
tests/v1/test_async_llm_dp.py
tests/v1/test_async_llm_dp.py
+3
-2
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+8
-7
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+5
-3
No files found.
tests/samplers/test_beam_search.py
View file @
2e3bfb1e
...
...
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import
pytest
import
os
from
..utils
import
models_path_prefix
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.assets.audio
import
AudioAsset
from
..utils
import
models_path_prefix
@
pytest
.
fixture
(
autouse
=
True
)
...
...
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make
# sure things pass through properly.
audios
=
[
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
]
model
=
"Qwen/Qwen2-Audio-7B-Instruct"
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)
audio_seq
=
"<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts
=
[
f
"<|im_start|>user
\n
{
audio_seq
}
Can you transcribe this?<|im_end|>
\n
<|im_start|>assistant
\n
"
#noqa: E501
...
...
@@ -140,4 +140,4 @@ def test_beam_search_passes_multimodal_data(
assert
filtered_hf_output_ids
[
-
1
]
==
eos_token_id
filtered_hf_output_ids
=
filtered_hf_output_ids
[:
-
1
]
assert
filtered_hf_output_ids
==
filtered_vllm_output_ids
assert
filtered_hf_output_ids
==
filtered_vllm_output_ids
\ No newline at end of file
tests/spec_decode/e2e/test_integration.py
View file @
2e3bfb1e
...
...
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"gptq"
,
},
...
...
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"marlin"
,
},
...
...
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
None
,
},
...
...
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
3
,
"disable_mqa_scorer"
:
True
,
},
...
...
@@ -151,4 +151,4 @@ def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
temperature
=
0.0
)
\ No newline at end of file
tests/v1/core/__init__.py
0 → 100644
View file @
2e3bfb1e
tests/v1/core/test_kv_cache_utils.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
torch
...
...
@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec
,
KVCacheTensor
)
from
vllm.v1.metrics.stats
import
PrefixCacheStats
from
vllm.v1.request
import
Request
from
...utils
import
models_path_prefix
# yapf: enable
...
...
@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"max_model_len"
,
"want_estimated_max_len"
),
[
(
"Qwen/Qwen1.5-7B"
,
16385
,
16384
),
(
"Qwen/Qwen1.5-7B"
,
16383
,
16383
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16385
,
16384
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16383
,
16383
),
])
def
test_estimate_max_model_len
(
model_id
,
max_model_len
,
want_estimated_max_len
):
...
...
tests/v1/core/test_scheduler.py
View file @
2e3bfb1e
...
...
@@ -2,6 +2,7 @@
from
typing
import
Optional
from
unittest.mock
import
Mock
import
os
import
pytest
import
torch
...
...
@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.request
import
Request
,
RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
from
...utils
import
models_path_prefix
EOS_TOKEN_ID
=
50256
def
create_scheduler
(
model
:
str
=
"facebook/opt-125m"
,
model
:
str
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_seqs
:
int
=
16
,
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
...
...
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def
test_schedule_multimodal_requests
():
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
)
scheduler
=
create_scheduler
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
i
,
length
=
100
)]
for
i
in
range
(
10
)]
requests
=
create_requests
(
...
...
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget.
"""
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
100
,
length
=
600
)]
...
...
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def
test_no_mm_input_chunking
():
# Disable multimodal input chunking.
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
disable_chunked_mm_input
=
True
,
max_model_len
=
2048
,
...
...
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input.
with
pytest
.
raises
(
ValueError
):
_
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
100
,
disable_chunked_mm_input
=
True
,
)
...
...
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""
scheduler
=
create_scheduler
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_batched_tokens
=
1024
,
long_prefill_token_threshold
=
400
,
enable_prefix_caching
=
enable_prefix_caching
,
...
...
@@ -1241,4 +1243,4 @@ def test_memory_leak():
scheduler
.
update_from_output
(
scheduler_output
,
model_runner_output
)
# Confirm no memory leak.
assert_scheduler_empty
(
scheduler
)
assert_scheduler_empty
(
scheduler
)
\ No newline at end of file
tests/v1/core/test_scheduler_e2e.py
View file @
2e3bfb1e
...
...
@@ -4,11 +4,12 @@ import os
import
pytest
from
vllm
import
LLM
from
...utils
import
models_path_prefix
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
PROMPT
=
"Hello my name is Robert and I"
...
...
@@ -26,4 +27,4 @@ def test_concurrent_partial_prefill(model):
outputs
=
model
.
generate
([
PROMPT
]
*
3
)
assert
len
(
outputs
)
==
3
for
output
in
outputs
:
assert
len
(
output
.
outputs
)
==
1
assert
len
(
output
.
outputs
)
==
1
\ No newline at end of file
tests/v1/shutdown/__init__.py
0 → 100644
View file @
2e3bfb1e
tests/v1/shutdown/test_delete.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
@
pytest
.
mark
.
asyncio
...
...
@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear
(
devices
=
list
(
range
(
tensor_parallel_size
)),
threshold_bytes
=
SHUTDOWN_TEST_THRESHOLD_BYTES
,
)
)
\ No newline at end of file
tests/v1/shutdown/test_forward_error.py
View file @
2e3bfb1e
...
...
@@ -3,6 +3,7 @@
import
asyncio
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.exceptions
import
EngineDeadError
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
def
evil_forward
(
self
,
*
args
,
**
kwargs
):
...
...
@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear
(
devices
=
list
(
range
(
tensor_parallel_size
)),
threshold_bytes
=
SHUTDOWN_TEST_THRESHOLD_BYTES
,
)
)
\ No newline at end of file
tests/v1/shutdown/test_processor_error.py
View file @
2e3bfb1e
...
...
@@ -3,6 +3,7 @@
import
asyncio
import
os
import
pytest
from
tests.v1.shutdown.utils
import
SHUTDOWN_TEST_TIMEOUT_SEC
...
...
@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.exceptions
import
EngineGenerateError
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
@
pytest
.
mark
.
asyncio
...
...
@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None:
generated_tokens
.
extend
(
out
.
outputs
[
0
].
token_ids
)
assert
len
(
generated_tokens
)
==
EXPECTED_TOKENS
async_llm
.
shutdown
()
async_llm
.
shutdown
()
\ No newline at end of file
tests/v1/shutdown/test_startup_error.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
def
evil_method
(
self
,
*
args
,
**
kwargs
):
...
...
@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing.
"""
if
model
!=
"meta-llama/Llama-3.2-1B"
:
if
model
!=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
:
pytest
.
skip
(
reason
=
"Only test meta-llama/Llama-3.2-1B"
)
if
cuda_device_count_stateless
()
<
tensor_parallel_size
:
pytest
.
skip
(
reason
=
"Not enough CUDA devices"
)
...
...
@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear
(
devices
=
list
(
range
(
tensor_parallel_size
)),
threshold_bytes
=
SHUTDOWN_TEST_THRESHOLD_BYTES
,
)
)
\ No newline at end of file
tests/v1/spec_decode/__init__.py
0 → 100644
View file @
2e3bfb1e
tests/v1/spec_decode/test_max_len.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly."""
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
_PROMPTS
=
[
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
,
...
...
@@ -21,7 +23,7 @@ def test_ngram_max_len(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_model_len
=
100
,
enforce_eager
=
True
,
# For faster initialization.
speculative_config
=
{
...
...
@@ -44,11 +46,11 @@ def test_eagle_max_len(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)
,
enforce_eager
=
True
,
# For faster initialization.
speculative_config
=
{
"method"
:
"eagle"
,
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
)
,
"num_speculative_tokens"
:
num_speculative_tokens
,
},
max_model_len
=
100
,
...
...
tests/v1/spec_decode/test_ngram.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
import
os
import
numpy
as
np
from
vllm.config
import
ModelConfig
,
SpeculativeConfig
,
VllmConfig
from
vllm.v1.spec_decode.ngram_proposer
import
(
NgramProposer
,
_find_subarray_kmp
,
_kmp_lps_array
)
from
...utils
import
models_path_prefix
def
test_kmp_lps_array
():
...
...
@@ -43,10 +45,10 @@ def test_ngram_proposer():
def
ngram_proposer
(
min_n
:
int
,
max_n
:
int
,
k
:
int
)
->
NgramProposer
:
# Dummy model config. Just to set max_model_len.
model_config
=
ModelConfig
(
model
=
"facebook/opt-125m"
,
model_config
=
ModelConfig
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
task
=
"generate"
,
max_model_len
=
100
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
tokenizer_mode
=
"auto"
,
dtype
=
"auto"
,
seed
=
None
,
...
...
@@ -86,4 +88,4 @@ def test_ngram_proposer():
result
=
ngram_proposer
(
2
,
4
,
2
).
propose
(
context_token_ids
=
np
.
array
([
3
,
4
,
5
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]))
assert
np
.
array_equal
(
result
,
np
.
array
([
1
,
2
]))
# Not [5, 2]
assert
np
.
array_equal
(
result
,
np
.
array
([
1
,
2
]))
# Not [5, 2]
\ No newline at end of file
tests/v1/test_async_llm_dp.py
View file @
2e3bfb1e
...
...
@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.core_client
import
DPAsyncMPClient
from
..utils
import
models_path_prefix
engine_args
=
AsyncEngineArgs
(
model
=
"ibm-research/PowerMoE-3b"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"ibm-research/PowerMoE-3b"
)
,
enforce_eager
=
True
,
disable_log_requests
=
True
,
tensor_parallel_size
=
int
(
os
.
getenv
(
"TP_SIZE"
,
1
)),
...
...
@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind):
await
asyncio
.
sleep
(
0.5
)
assert
not
core_client
.
engines_running
assert
not
core_client
.
reqs_in_flight
assert
not
core_client
.
reqs_in_flight
\ No newline at end of file
tests/v1/test_oracle.py
View file @
2e3bfb1e
...
...
@@ -7,16 +7,17 @@ import vllm.envs as envs
from
vllm
import
LLM
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
..utils
import
models_path_prefix
UNSUPPORTED_MODELS_V1
=
[
"openai/whisper-large-v3"
,
# transcription
"facebook/bart-large-cnn"
,
# encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1"
,
# mamba
"ibm-ai-platform/Bamba-9B"
,
# hybrid
"BAAI/bge-m3"
,
# embedding
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)
,
# transcription
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-large-cnn"
)
,
# encoder decoder
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mamba-Codestral-7B-v0.1"
)
,
# mamba
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
)
,
# hybrid
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-m3"
)
,
# embedding
]
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
UNSUPPORTED_MODELS_V1
)
...
...
@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch):
AsyncLLMEngine
.
_get_executor_cls
(
vllm_config
),
log_stats
=
True
)
m
.
delenv
(
"VLLM_USE_V1"
)
m
.
delenv
(
"VLLM_USE_V1"
)
\ No newline at end of file
tests/v1/worker/test_gpu_model_runner.py
View file @
2e3bfb1e
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
...
...
@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput
)
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
from
...utils
import
models_path_prefix
@
pytest
.
fixture
...
...
@@ -17,9 +19,9 @@ def model_runner():
max_model_len
=
512
,
)
model_config
=
ModelConfig
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
task
=
"generate"
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
...
...
@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner):
assert
_is_req_scheduled
(
model_runner
,
req_ids
[
0
])
assert
_is_req_added
(
model_runner
,
req_ids
[
1
])
assert
not
_is_req_scheduled
(
model_runner
,
req_ids
[
1
])
assert
not
_is_req_scheduled
(
model_runner
,
req_ids
[
1
])
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment