Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bd363067
Commit
bd363067
authored
Jun 05, 2025
by
lizhigong
Browse files
Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead
parents
87ef4618
d36deb1a
Changes
106
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
197 additions
and
170 deletions
+197
-170
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+3
-3
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+4
-4
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+6
-6
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+5
-5
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+4
-2
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+2
-2
tests/spec_decode/test_multi_step_worker.py
tests/spec_decode/test_multi_step_worker.py
+101
-101
tests/spec_decode/test_spec_decode_worker.py
tests/spec_decode/test_spec_decode_worker.py
+5
-3
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-2
tests/test_config.py
tests/test_config.py
+10
-10
tests/test_regression.py
tests/test_regression.py
+20
-9
tests/test_sampling_params.py
tests/test_sampling_params.py
+3
-1
tests/test_utils.py
tests/test_utils.py
+1
-0
tests/tool_use/utils.py
tests/tool_use/utils.py
+9
-9
tests/tpu/untest_quantization_accuracy.py
tests/tpu/untest_quantization_accuracy.py
+4
-2
tests/v1/core/__init__.py
tests/v1/core/__init__.py
+0
-0
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+4
-2
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+9
-7
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+3
-2
tests/v1/shutdown/__init__.py
tests/v1/shutdown/__init__.py
+0
-0
No files found.
tests/samplers/test_beam_search.py
View file @
bd363067
...
...
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import
pytest
import
os
from
..utils
import
models_path_prefix
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.assets.audio
import
AudioAsset
from
..utils
import
models_path_prefix
@
pytest
.
fixture
(
autouse
=
True
)
...
...
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make
# sure things pass through properly.
audios
=
[
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
]
model
=
"Qwen/Qwen2-Audio-7B-Instruct"
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)
audio_seq
=
"<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts
=
[
f
"<|im_start|>user
\n
{
audio_seq
}
Can you transcribe this?<|im_end|>
\n
<|im_start|>assistant
\n
"
#noqa: E501
...
...
tests/spec_decode/e2e/test_compatibility.py
View file @
bd363067
...
...
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{
# Speculative max model len > overridden max model len should raise.
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
129
,
},
...
...
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
2048
+
1
,
},
...
...
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
131072
+
1
,
},
...
...
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
bd363067
...
...
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"meta-llama/Llama-2-7b-chat-hf"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-chat-hf"
)
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-llama2-chat-7B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-llama2-chat-7B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
...
...
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
...
...
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"Qwen/Qwen2-7B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
)
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-Qwen2-7B-Instruct"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-Qwen2-7B-Instruct"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
...
...
tests/spec_decode/e2e/test_integration.py
View file @
bd363067
...
...
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"gptq"
,
},
...
...
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"marlin"
,
},
...
...
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"quantization"
:
None
,
},
...
...
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
3
,
"disable_mqa_scorer"
:
True
,
},
...
...
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
bd363067
...
...
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
"""
import
os
import
pytest
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
# main model
MAIN_MODEL
=
"luccafong/deepseek_mtp_main_random"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
)
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
bd363067
...
...
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
"JackFram/llama-68m"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
...
...
tests/spec_decode/test_multi_step_worker.py
View file @
bd363067
...
...
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert
(
num_mismatch
>
0
)
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
'num_steps'
,
[
1
,
2
,
3
,
4
])
# The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths.
@
pytest
.
mark
.
parametrize
(
'attn_backend'
,
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
])
def
test_multi_step_correct_kvcache
(
num_steps
,
attn_backend
):
"""Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token.
"""
seed
=
100
model_name
=
"JackFram/llama-68m"
block_size
=
16
num_gpu_blocks
=
2048
//
block_size
batch_size
=
1
with
global_force_attn_backend_context_manager
(
attn_backend
):
dtype
=
'float16'
if
attn_backend
==
_Backend
.
FLASH_ATTN
else
'float32'
multi_step_worker
=
create_worker
(
MultiStepWorker
,
model_name
,
block_size
,
num_gpu_blocks
,
seed
,
model_runner_cls
=
TP1DraftModelRunner
,
dtype
=
dtype
)
multi_step_worker
.
set_include_gpu_probs_tensor
()
worker
=
create_worker
(
Worker
,
model_name
,
block_size
,
num_gpu_blocks
,
seed
,
dtype
=
dtype
)
prompts
=
[[
0
]
for
_
in
range
(
batch_size
)]
# Already generate two tokens for the sequence
# so that we can simulate the bonus token case
multi_step_continuations
=
[[
random
.
randint
(
0
,
1000
),
random
.
randint
(
0
,
1000
)
]
for
_
in
prompts
]
final_prompt_lens
=
[
len
(
prompt
)
+
2
+
num_steps
for
prompt
in
prompts
]
seq_ids_with_bonus_token_in_last_step
=
set
(
range
(
batch_size
))
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
prompts
,
num_gpu_blocks
,
block_size
,
continuations
=
multi_step_continuations
,
final_prompt_lens
=
final_prompt_lens
)
# Run multi-step.
zero_kv_cache
(
multi_step_worker
.
cache_engine
)
multi_step_worker
.
sampler_output
(
execute_model_req
=
ExecuteModelRequest
(
seq_group_metadata_list
=
seq_group_metadata_list
),
sample_len
=
num_steps
,
seq_ids_with_bonus_token_in_last_step
=
seq_ids_with_bonus_token_in_last_step
)
# Run single-step repeatedly.
zero_kv_cache
(
worker
.
cache_engine
)
# Generate the kv cache for the bonus token first
single_step_continuations
=
[
c
[:
1
]
for
c
in
multi_step_continuations
]
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
prompts
,
num_gpu_blocks
,
block_size
,
continuations
=
single_step_continuations
,
final_prompt_lens
=
final_prompt_lens
)
single_step_output
=
worker
.
execute_model
(
execute_model_req
=
ExecuteModelRequest
(
seq_group_metadata_list
=
seq_group_metadata_list
))
for
_
in
range
(
num_steps
):
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
prompts
,
num_gpu_blocks
,
block_size
,
continuations
=
multi_step_continuations
,
final_prompt_lens
=
final_prompt_lens
)
single_step_output
=
worker
.
execute_model
(
execute_model_req
=
ExecuteModelRequest
(
seq_group_metadata_list
=
seq_group_metadata_list
))
for
i
,
seq_group_output
in
enumerate
(
single_step_output
[
-
1
]):
multi_step_continuations
[
i
].
append
(
seq_group_output
.
samples
[
0
].
output_token
)
# Verify that the KV cache of the single-step and
# multi-step workers are the same.
single_step_gpu_cache
=
worker
.
cache_engine
[
0
].
gpu_cache
multi_step_gpu_cache
=
multi_step_worker
.
cache_engine
[
0
].
gpu_cache
num_layers
=
len
(
single_step_gpu_cache
)
allclose
=
lambda
a
,
b
:
torch
.
allclose
(
a
.
cuda
(),
b
.
cuda
(),
rtol
=
1e-2
,
atol
=
1e-2
)
for
i
in
range
(
num_layers
):
assert
allclose
(
single_step_gpu_cache
[
i
][
0
],
multi_step_gpu_cache
[
i
][
0
])
assert
allclose
(
single_step_gpu_cache
[
i
][
1
],
multi_step_gpu_cache
[
i
][
1
])
#
@torch.inference_mode()
#
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
#
# The choice of backends forces the multi_step_worker to choose between
#
# the vanilla model_runner and TP1DraftModelRunner and that we can test
#
# both code paths.
#
@pytest.mark.parametrize('attn_backend',
#
[_Backend.XFORMERS, _Backend.FLASH_ATTN])
#
def test_multi_step_correct_kvcache(num_steps, attn_backend):
#
"""Verify that the KV cache of the draft model
#
is correctly updated for sequences with bonus token.
#
"""
#
seed = 100
#
model_name = "JackFram/llama-68m"
#
block_size = 16
#
num_gpu_blocks = 2048 // block_size
#
batch_size = 1
#
with global_force_attn_backend_context_manager(attn_backend):
#
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
#
multi_step_worker = create_worker(MultiStepWorker,
#
model_name,
#
block_size,
#
num_gpu_blocks,
#
seed,
#
model_runner_cls=TP1DraftModelRunner,
#
dtype=dtype)
#
multi_step_worker.set_include_gpu_probs_tensor()
#
worker = create_worker(Worker,
#
model_name,
#
block_size,
#
num_gpu_blocks,
#
seed,
#
dtype=dtype)
#
prompts = [[0] for _ in range(batch_size)]
#
# Already generate two tokens for the sequence
#
# so that we can simulate the bonus token case
#
multi_step_continuations = [[
#
random.randint(0, 1000),
#
random.randint(0, 1000)
#
] for _ in prompts]
#
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
#
seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
#
prompts,
#
num_gpu_blocks,
#
block_size,
#
continuations=multi_step_continuations,
#
final_prompt_lens=final_prompt_lens)
#
# Run multi-step.
#
zero_kv_cache(multi_step_worker.cache_engine)
#
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
#
seq_group_metadata_list=seq_group_metadata_list),
#
sample_len=num_steps,
#
seq_ids_with_bonus_token_in_last_step=
#
seq_ids_with_bonus_token_in_last_step)
#
# Run single-step repeatedly.
#
zero_kv_cache(worker.cache_engine)
#
# Generate the kv cache for the bonus token first
#
single_step_continuations = [c[:1] for c in multi_step_continuations]
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
#
prompts,
#
num_gpu_blocks,
#
block_size,
#
continuations=single_step_continuations,
#
final_prompt_lens=final_prompt_lens)
#
single_step_output = worker.execute_model(
#
execute_model_req=ExecuteModelRequest(
#
seq_group_metadata_list=seq_group_metadata_list))
#
for _ in range(num_steps):
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
#
prompts,
#
num_gpu_blocks,
#
block_size,
#
continuations=multi_step_continuations,
#
final_prompt_lens=final_prompt_lens)
#
single_step_output = worker.execute_model(
#
execute_model_req=ExecuteModelRequest(
#
seq_group_metadata_list=seq_group_metadata_list))
#
for i, seq_group_output in enumerate(single_step_output[-1]):
#
multi_step_continuations[i].append(
#
seq_group_output.samples[0].output_token)
#
# Verify that the KV cache of the single-step and
#
# multi-step workers are the same.
#
single_step_gpu_cache = worker.cache_engine[0].gpu_cache
#
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
#
num_layers = len(single_step_gpu_cache)
#
allclose = lambda a, b: torch.allclose(
#
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
#
for i in range(num_layers):
#
assert allclose(single_step_gpu_cache[i][0],
#
multi_step_gpu_cache[i][0])
#
assert allclose(single_step_gpu_cache[i][1],
#
multi_step_gpu_cache[i][1])
@
torch
.
inference_mode
()
...
...
tests/spec_decode/test_spec_decode_worker.py
View file @
bd363067
...
...
@@ -5,6 +5,7 @@ from collections import defaultdict
from
types
import
SimpleNamespace
from
unittest.mock
import
MagicMock
import
os
import
pytest
import
torch
...
...
@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from
.test_utils
import
mock_spec_decode_sampler
from
.utils
import
(
create_batch
,
create_sampler_output_list
,
create_worker
,
mock_worker
)
from
..utils
import
models_path_prefix
@
pytest
.
mark
.
parametrize
(
'k'
,
[
1
,
2
,
6
])
...
...
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks
=
8096
//
block_size
target_worker
=
create_worker
(
Worker
,
"JackFram/llama-68m"
,
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
block_size
,
num_gpu_blocks
,
seed
,
)
draft_worker
=
create_worker
(
MultiStepWorker
,
"abhigoyal/vllm-eagle-llama-68m-random"
,
os
.
path
.
join
(
models_path_prefix
,
"abhigoyal/vllm-eagle-llama-68m-random"
)
,
block_size
,
num_gpu_blocks
,
seed
,
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
bd363067
...
...
@@ -7,6 +7,7 @@ import pathlib
import
subprocess
from
functools
import
partial
from
unittest.mock
import
MagicMock
,
patch
from
typing
import
List
,
Tuple
,
Optional
import
openai
import
pytest
...
...
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
...
...
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
)
model_ref
=
"meta-llama/Llama-2-7b-hf"
model_ref
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-hf"
)
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
...
...
tests/test_config.py
View file @
bd363067
...
...
@@ -142,7 +142,7 @@ def test_get_sliding_window():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_id
,
task
=
"auto"
,
...
...
@@ -164,7 +164,7 @@ def test_get_pooling_config():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config_from_args
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_id
,
task
=
"auto"
,
tokenizer
=
model_id
,
...
...
@@ -273,10 +273,10 @@ def test_rope_customization():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Encoder Decoder models not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
(
"facebook/opt-125m"
,
False
),
(
"facebook/bart-base"
,
True
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
False
),
(
"meta-llama/Llama-3.2-11B-Vision"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-base"
)
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision"
)
,
True
),
])
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
config
=
ModelConfig
(
...
...
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"uses_mrope"
),
[
(
"facebook/opt-125m"
,
False
),
(
"Qwen/Qwen2-VL-2B-Instruct"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
,
True
),
])
def
test_uses_mrope
(
model_id
,
uses_mrope
):
config
=
ModelConfig
(
...
...
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
def
test_generation_config_loading
():
model_id
=
"Qwen/Qwen2.5-1.5B-Instruct"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
# When set generation_config to "vllm", the default generation config
# will not be loaded.
...
...
tests/test_regression.py
View file @
bd363067
...
...
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
will never happen again.
"""
import
os
import
gc
import
pytest
...
...
@@ -12,8 +13,9 @@ import torch
from
vllm
import
LLM
,
SamplingParams
from
utils
import
models_path_prefix
import
os
from
.utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
...
...
@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
max_tokens
=
256
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
prompts
=
[
"This is a short prompt"
,
"This is a very long prompt "
*
1000
]
...
...
@@ -36,7 +38,13 @@ def test_max_tokens_none():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
max_tokens
=
None
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
block_size
=
64
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
prompts
=
[
"Just say hello!"
]
...
...
@@ -46,7 +54,7 @@ def test_max_tokens_none():
def
test_gc
():
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
enforce_eager
=
True
)
del
llm
gc
.
collect
()
...
...
@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
prompts
=
[
...
...
tests/test_sampling_params.py
View file @
bd363067
...
...
@@ -2,13 +2,15 @@
"""Tests for the SamplingParams class.
"""
import
os
import
pytest
from
vllm
import
SamplingParams
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
utils
import
models_path_prefix
MODEL_NAME
=
"Qwen/Qwen1.5-7B"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
def
test_max_tokens_none
():
...
...
tests/test_utils.py
View file @
bd363067
...
...
@@ -8,6 +8,7 @@ import socket
from
collections.abc
import
AsyncIterator
from
unittest.mock
import
patch
import
os
import
pytest
import
torch
from
vllm_test_utils.monitor
import
monitor
...
...
tests/tool_use/utils.py
View file @
bd363067
...
...
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3.1-8B-Instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
...
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama3.2"
:
{
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-3B-Instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
...
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4"
:
{
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
...
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4_json"
:
{
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"-tp"
,
"4"
,
"--distributed-executor-backend"
,
"mp"
,
"--tool-call-parser"
,
...
...
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# },
"granite-3.0-8b"
:
{
"model"
:
"ibm-granite/granite-3.0-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.0-8b-instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"granite"
,
"--chat-template"
,
...
...
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"granite-3.1-8b"
:
{
"model"
:
"ibm-granite/granite-3.1-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.1-8b-instruct"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
...
...
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"internlm"
:
{
"model"
:
"internlm/internlm2_5-7b-chat"
,
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2_5-7b-chat"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"internlm"
,
"--chat-template"
,
...
...
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"toolACE"
:
{
"model"
:
"Team-ACE/ToolACE-8B"
,
os
.
path
.
join
(
models_path_prefix
,
"Team-ACE/ToolACE-8B"
)
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
...
tests/tpu/test_quantization_accuracy.py
→
tests/tpu/
un
test_quantization_accuracy.py
View file @
bd363067
...
...
@@ -4,6 +4,8 @@ from dataclasses import dataclass
import
lm_eval
import
pytest
import
os
from
..utils
import
models_path_prefix
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
...
...
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS
=
[
GSM8KAccuracyTestConfig
(
model_name
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
)
,
excepted_value
=
0.76
),
# no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
...
...
tests/v1/core/__init__.py
0 → 100644
View file @
bd363067
tests/v1/core/test_kv_cache_utils.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
torch
...
...
@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec
,
KVCacheTensor
)
from
vllm.v1.metrics.stats
import
PrefixCacheStats
from
vllm.v1.request
import
Request
from
...utils
import
models_path_prefix
# yapf: enable
...
...
@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"max_model_len"
,
"want_estimated_max_len"
),
[
(
"Qwen/Qwen1.5-7B"
,
16385
,
16384
),
(
"Qwen/Qwen1.5-7B"
,
16383
,
16383
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16385
,
16384
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16383
,
16383
),
])
def
test_estimate_max_model_len
(
model_id
,
max_model_len
,
want_estimated_max_len
):
...
...
tests/v1/core/test_scheduler.py
View file @
bd363067
...
...
@@ -2,6 +2,7 @@
from
typing
import
Optional
from
unittest.mock
import
Mock
import
os
import
pytest
import
torch
...
...
@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.request
import
Request
,
RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
from
...utils
import
models_path_prefix
EOS_TOKEN_ID
=
50256
def
create_scheduler
(
model
:
str
=
"facebook/opt-125m"
,
model
:
str
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_seqs
:
int
=
16
,
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
...
...
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def
test_schedule_multimodal_requests
():
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
)
scheduler
=
create_scheduler
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
i
,
length
=
100
)]
for
i
in
range
(
10
)]
requests
=
create_requests
(
...
...
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget.
"""
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
100
,
length
=
600
)]
...
...
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def
test_no_mm_input_chunking
():
# Disable multimodal input chunking.
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
disable_chunked_mm_input
=
True
,
max_model_len
=
2048
,
...
...
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input.
with
pytest
.
raises
(
ValueError
):
_
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
100
,
disable_chunked_mm_input
=
True
,
)
...
...
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""
scheduler
=
create_scheduler
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_batched_tokens
=
1024
,
long_prefill_token_threshold
=
400
,
enable_prefix_caching
=
enable_prefix_caching
,
...
...
tests/v1/core/test_scheduler_e2e.py
View file @
bd363067
...
...
@@ -4,11 +4,12 @@ import os
import
pytest
from
vllm
import
LLM
from
...utils
import
models_path_prefix
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/shutdown/__init__.py
0 → 100644
View file @
bd363067
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment