Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ced28510
"docs/vscode:/vscode.git/clone" did not exist on "70b1b330e10f5eba8bf003500834d214c8b4a559"
Commit
ced28510
authored
Jun 03, 2025
by
zhuwenwen
Browse files
[tests] fix tests of core, engine and detokenizer
parent
734a433d
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
95 additions
and
60 deletions
+95
-60
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+23
-23
tests/compile/untest_functionalization.py
tests/compile/untest_functionalization.py
+0
-0
tests/compile/untest_fusion.py
tests/compile/untest_fusion.py
+0
-0
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+17
-15
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+7
-2
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+5
-2
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_disable_detokenization.py
+7
-3
tests/detokenizer/test_stop_strings.py
tests/detokenizer/test_stop_strings.py
+4
-2
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+7
-3
tests/engine/test_executor.py
tests/engine/test_executor.py
+8
-3
tests/engine/test_short_mm_context.py
tests/engine/test_short_mm_context.py
+2
-1
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+7
-2
tests/fastsafetensors_loader/test_fastsafetensors_loader.py
tests/fastsafetensors_loader/test_fastsafetensors_loader.py
+4
-2
tests/fastsafetensors_loader/test_weight_utils.py
tests/fastsafetensors_loader/test_weight_utils.py
+4
-2
No files found.
tests/compile/test_basic_correctness.py
View file @
ced28510
...
@@ -29,18 +29,18 @@ class TestSetting:
...
@@ -29,18 +29,18 @@ class TestSetting:
"test_setting"
,
"test_setting"
,
[
[
# basic llama model
# basic llama model
TestSetting
(
#
TestSetting(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
#
model=
os.path.join(models_path_prefix,
"meta-llama/Llama-3.2-1B-Instruct"
)
,
model_args
=
[],
#
model_args=[],
pp_size
=
2
,
#
pp_size=2,
tp_size
=
2
,
#
tp_size=2,
attn_backend
=
"FLASHINFER"
,
#
attn_backend="FLASHINFER",
method
=
"generate"
,
#
method="generate",
fullgraph
=
True
,
#
fullgraph=True,
),
#
),
# llama model with quantization
# llama model with quantization
TestSetting
(
TestSetting
(
model
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
)
,
model_args
=
[
"--quantization"
,
"gptq"
],
model_args
=
[
"--quantization"
,
"gptq"
],
pp_size
=
1
,
pp_size
=
1
,
tp_size
=
1
,
tp_size
=
1
,
...
@@ -50,7 +50,7 @@ class TestSetting:
...
@@ -50,7 +50,7 @@ class TestSetting:
),
),
# MoE model
# MoE model
TestSetting
(
TestSetting
(
model
=
"ibm/PowerMoE-3b"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
)
,
model_args
=
[],
model_args
=
[],
pp_size
=
1
,
pp_size
=
1
,
tp_size
=
2
,
tp_size
=
2
,
...
@@ -60,7 +60,7 @@ class TestSetting:
...
@@ -60,7 +60,7 @@ class TestSetting:
),
),
# embedding model
# embedding model
TestSetting
(
TestSetting
(
model
=
"BAAI/bge-multilingual-gemma2"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
)
,
model_args
=
[
"--task"
,
"embed"
,
"--dtype"
,
"bfloat16"
],
model_args
=
[
"--task"
,
"embed"
,
"--dtype"
,
"bfloat16"
],
pp_size
=
1
,
pp_size
=
1
,
tp_size
=
1
,
tp_size
=
1
,
...
@@ -69,18 +69,18 @@ class TestSetting:
...
@@ -69,18 +69,18 @@ class TestSetting:
fullgraph
=
True
,
fullgraph
=
True
,
),
),
# encoder-based embedding model (BERT)
# encoder-based embedding model (BERT)
TestSetting
(
#
TestSetting(
model
=
"BAAI/bge-base-en-v1.5"
,
#
model=
os.path.join(models_path_prefix,
"BAAI/bge-base-en-v1.5"
)
,
model_args
=
[
"--task"
,
"embed"
],
#
model_args=["--task", "embed"],
pp_size
=
1
,
#
pp_size=1,
tp_size
=
1
,
#
tp_size=1,
attn_backend
=
"XFORMERS"
,
#
attn_backend="XFORMERS",
method
=
"encode"
,
#
method="encode",
fullgraph
=
True
,
#
fullgraph=True,
),
#
),
# vision language model
# vision language model
TestSetting
(
TestSetting
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
,
model_args
=
[
"--trust-remote-code"
,
"--max-model-len"
,
"2048"
],
model_args
=
[
"--trust-remote-code"
,
"--max-model-len"
,
"2048"
],
pp_size
=
2
,
pp_size
=
2
,
tp_size
=
1
,
tp_size
=
1
,
...
...
tests/compile/test_functionalization.py
→
tests/compile/
un
test_functionalization.py
View file @
ced28510
File moved
tests/compile/test_fusion.py
→
tests/compile/
un
test_fusion.py
View file @
ced28510
File moved
tests/core/block/e2e/test_correctness.py
View file @
ced28510
...
@@ -9,6 +9,8 @@ from vllm import SamplingParams
...
@@ -9,6 +9,8 @@ from vllm import SamplingParams
from
.conftest
import
get_token_ids_from_llm_generator
from
.conftest
import
get_token_ids_from_llm_generator
import
os
import
os
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.utils
import
SUPPORT_TC
,
gpuname
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
...
@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
...
@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
[
[
{
{
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
# Allow only 2 sequences of ~128 tokens in worst case.
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
# Note 8 = 128/block_size
"num_gpu_blocks_override"
:
2
*
(
8
+
1
),
"num_gpu_blocks_override"
:
2
*
(
8
+
1
),
},
},
{
# {
"block_size"
:
8
,
#
"block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case.
#
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size
#
# Note 16 = 128/block_size
"num_gpu_blocks_override"
:
2
*
(
16
+
2
),
#
"num_gpu_blocks_override": 2 * (16 + 2),
}
#
}
])
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"num_lookahead_slots"
:
0
,
"num_lookahead_slots"
:
0
,
...
@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
[{
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"max_num_batched_tokens"
:
2
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"max_num_batched_tokens"
:
3
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"max_num_batched_tokens"
:
256
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
"max_num_seqs"
:
10
,
}])
}])
...
@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
...
@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Enable prefill cache
# Enable prefill cache
...
@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
...
@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
...
@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
# we keep the blocks small, so that hit eviction quickly
"max_model_len"
:
48
,
"max_model_len"
:
48
,
"block_size"
:
16
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"num_gpu_blocks_override"
:
3
,
"num_gpu_blocks_override"
:
3
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
ced28510
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
os
import
pytest
# noqa
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
...
@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
...
@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
.utils
import
create_dummy_prompt
from
.utils
import
create_dummy_prompt
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
def
get_sequence_groups
(
scheduler_output
):
def
get_sequence_groups
(
scheduler_output
):
...
@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
...
@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
assert
out
.
num_batched_tokens
==
44
assert
out
.
num_batched_tokens
==
44
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
])
@
pytest
.
mark
.
parametrize
(
"max_num_partial_prefills"
,
[
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"max_num_partial_prefills"
,
[
2
,
4
,
8
])
def
test_chunked_prefill_with_actual_engine
(
model
:
str
,
def
test_chunked_prefill_with_actual_engine
(
model
:
str
,
max_num_partial_prefills
:
int
):
max_num_partial_prefills
:
int
):
...
@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
...
@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs
=
8
,
max_num_seqs
=
8
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
gpu_memory_utilization
=
0.8
,
gpu_memory_utilization
=
0.8
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/core/test_num_computed_tokens_update.py
View file @
ced28510
...
@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
...
@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SequenceGroup
from
vllm.sequence
import
SequenceGroup
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
)
...
@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
...
@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
num_scheduler_steps
=
num_scheduler_steps
,
num_scheduler_steps
=
num_scheduler_steps
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enforce_eager
=
enforce_eager
)
enforce_eager
=
enforce_eager
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
# In multi-step + chunked-prefill there is no separate single prompt step.
...
...
tests/detokenizer/test_disable_detokenization.py
View file @
ced28510
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.utils
import
SUPPORT_TC
,
gpuname
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
])
def
test_computed_prefix_blocks
(
model
:
str
):
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
# without optional detokenization, that detokenization includes text
...
@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
...
@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
"online for free?"
)
llm
=
LLM
(
model
=
model
)
llm
=
LLM
(
model
=
model
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
detokenize
=
False
)
detokenize
=
False
)
...
...
tests/detokenizer/test_stop_strings.py
View file @
ced28510
...
@@ -2,11 +2,13 @@
...
@@ -2,11 +2,13 @@
from
typing
import
Any
,
Optional
from
typing
import
Any
,
Optional
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
,
envs
from
vllm
import
LLM
,
SamplingParams
,
envs
from
..utils
import
models_path_prefix
MODEL
=
"meta-llama/llama-2-7b-hf"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/llama-2-7b-hf"
)
MAX_TOKENS
=
200
MAX_TOKENS
=
200
...
...
tests/engine/test_computed_prefix_blocks.py
View file @
ced28510
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
64
]
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
# This test checks if we are able to run the engine to completion
# without triggering asserts.
# without triggering asserts.
...
...
tests/engine/test_executor.py
View file @
ced28510
...
@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
...
@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
class
Mock
:
class
Mock
:
...
@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
...
@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
CustomUniExecutor
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
...
@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
])
def
test_custom_executor_async
(
model
,
tmp_path
):
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
...
@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
])
def
test_respect_ray
(
model
):
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# if users specify ray, we should use ray.
...
@@ -106,6 +110,7 @@ def test_respect_ray(model):
...
@@ -106,6 +110,7 @@ def test_respect_ray(model):
model
=
model
,
model
=
model
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
assert
engine
.
model_executor
.
uses_ray
assert
engine
.
model_executor
.
uses_ray
\ No newline at end of file
tests/engine/test_short_mm_context.py
View file @
ced28510
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
from
..conftest
import
IMAGE_ASSETS
from
..conftest
import
IMAGE_ASSETS
...
...
tests/engine/test_skip_tokenizer_init.py
View file @
ced28510
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# of tokenizer and detokenizer. The generated output is expected to contain
...
@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
...
@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
llm
=
LLM
(
llm
=
LLM
(
model
=
model
,
model
=
model
,
skip_tokenizer_init
=
True
,
skip_tokenizer_init
=
True
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
...
...
tests/fastsafetensors_loader/test_fastsafetensors_loader.py
View file @
ced28510
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.config
import
LoadFormat
from
..utils
import
models_path_prefix
test_model
=
"openai-community/gpt2"
test_model
=
os
.
path
.
join
(
models_path_prefix
,
"openai-community/gpt2"
)
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
...
tests/fastsafetensors_loader/test_weight_utils.py
View file @
ced28510
...
@@ -2,9 +2,11 @@
...
@@ -2,9 +2,11 @@
import
glob
import
glob
import
tempfile
import
tempfile
import
os
import
huggingface_hub.constants
import
huggingface_hub.constants
import
torch
import
torch
from
..utils
import
models_path_prefix
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
fastsafetensors_weights_iterator
,
download_weights_from_hf
,
fastsafetensors_weights_iterator
,
...
@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
def
test_fastsafetensors_model_loader
():
def
test_fastsafetensors_model_loader
():
with
tempfile
.
TemporaryDirectory
()
as
tmpdir
:
with
tempfile
.
TemporaryDirectory
()
as
tmpdir
:
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
False
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
False
download_weights_from_hf
(
"openai-community/gpt2"
,
download_weights_from_hf
(
os
.
path
.
join
(
models_path_prefix
,
"openai-community/gpt2"
)
,
allow_patterns
=
[
"*.safetensors"
],
allow_patterns
=
[
"*.safetensors"
],
cache_dir
=
tmpdir
)
cache_dir
=
tmpdir
)
safetensors
=
glob
.
glob
(
f
"
{
tmpdir
}
/**/*.safetensors"
,
recursive
=
True
)
safetensors
=
glob
.
glob
(
f
"
{
tmpdir
}
/**/*.safetensors"
,
recursive
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment