Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bd363067
Commit
bd363067
authored
Jun 05, 2025
by
lizhigong
Browse files
Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead
parents
87ef4618
d36deb1a
Changes
106
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
197 additions
and
170 deletions
+197
-170
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+3
-3
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+4
-4
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+6
-6
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+5
-5
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+4
-2
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+2
-2
tests/spec_decode/test_multi_step_worker.py
tests/spec_decode/test_multi_step_worker.py
+101
-101
tests/spec_decode/test_spec_decode_worker.py
tests/spec_decode/test_spec_decode_worker.py
+5
-3
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-2
tests/test_config.py
tests/test_config.py
+10
-10
tests/test_regression.py
tests/test_regression.py
+20
-9
tests/test_sampling_params.py
tests/test_sampling_params.py
+3
-1
tests/test_utils.py
tests/test_utils.py
+1
-0
tests/tool_use/utils.py
tests/tool_use/utils.py
+9
-9
tests/tpu/untest_quantization_accuracy.py
tests/tpu/untest_quantization_accuracy.py
+4
-2
tests/v1/core/__init__.py
tests/v1/core/__init__.py
+0
-0
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+4
-2
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+9
-7
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+3
-2
tests/v1/shutdown/__init__.py
tests/v1/shutdown/__init__.py
+0
-0
No files found.
tests/samplers/test_beam_search.py
View file @
bd363067
...
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
...
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import
pytest
import
pytest
import
os
import
os
from
..utils
import
models_path_prefix
from
transformers
import
AutoModelForSeq2SeqLM
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.audio
import
AudioAsset
from
..utils
import
models_path_prefix
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
...
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make
# correctly. As such, we just need to check one extra modality to make
# sure things pass through properly.
# sure things pass through properly.
audios
=
[
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
]
audios
=
[
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
]
model
=
"Qwen/Qwen2-Audio-7B-Instruct"
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)
audio_seq
=
"<|audio_bos|><|AUDIO|><|audio_eos|>"
audio_seq
=
"<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts
=
[
prompts
=
[
f
"<|im_start|>user
\n
{
audio_seq
}
Can you transcribe this?<|im_end|>
\n
<|im_start|>assistant
\n
"
#noqa: E501
f
"<|im_start|>user
\n
{
audio_seq
}
Can you transcribe this?<|im_end|>
\n
<|im_start|>assistant
\n
"
#noqa: E501
...
...
tests/spec_decode/e2e/test_compatibility.py
View file @
bd363067
...
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
...
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{
{
# Speculative max model len > overridden max model len should raise.
# Speculative max model len > overridden max model len should raise.
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
129
,
"max_model_len"
:
129
,
},
},
...
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
...
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise.
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
2048
+
1
,
"max_model_len"
:
2048
+
1
,
},
},
...
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
...
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise.
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
131072
+
1
,
"max_model_len"
:
131072
+
1
,
},
},
...
...
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
bd363067
...
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"meta-llama/Llama-2-7b-chat-hf"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-chat-hf"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-llama2-chat-7B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-llama2-chat-7B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"Qwen/Qwen2-7B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-Qwen2-7B-Instruct"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-Qwen2-7B-Instruct"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
...
tests/spec_decode/e2e/test_integration.py
View file @
bd363067
...
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization
# Explicitly specify draft model quantization
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"gptq"
,
"quantization"
:
"gptq"
,
},
},
...
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"marlin"
,
"quantization"
:
"marlin"
,
},
},
...
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Do not explicitly specify draft model quantization
# Do not explicitly specify draft model quantization
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"quantization"
:
None
,
"quantization"
:
None
,
},
},
...
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
...
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
3
,
"num_speculative_tokens"
:
3
,
"disable_mqa_scorer"
:
True
,
"disable_mqa_scorer"
:
True
,
},
},
...
...
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
bd363067
...
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
...
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctness for the target model outputs.
correctness for the target model outputs.
"""
"""
import
os
import
pytest
import
pytest
from
.conftest
import
run_equality_correctness_test
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
# main model
# main model
MAIN_MODEL
=
"luccafong/deepseek_mtp_main_random"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
)
# max. number of speculative tokens: this corresponds to
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
# num_nextn_predict_layers in the config.json of the speculator model.
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
bd363067
...
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-68m"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
...
...
tests/spec_decode/test_multi_step_worker.py
View file @
bd363067
...
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
...
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert
(
num_mismatch
>
0
)
assert
(
num_mismatch
>
0
)
@
torch
.
inference_mode
()
#
@torch.inference_mode()
@
pytest
.
mark
.
parametrize
(
'num_steps'
,
[
1
,
2
,
3
,
4
])
#
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between
#
# The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test
#
# the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths.
#
# both code paths.
@
pytest
.
mark
.
parametrize
(
'attn_backend'
,
#
@pytest.mark.parametrize('attn_backend',
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
])
#
[_Backend.XFORMERS, _Backend.FLASH_ATTN])
def
test_multi_step_correct_kvcache
(
num_steps
,
attn_backend
):
#
def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model
#
"""Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token.
#
is correctly updated for sequences with bonus token.
"""
#
"""
seed
=
100
#
seed = 100
model_name
=
"JackFram/llama-68m"
#
model_name = "JackFram/llama-68m"
block_size
=
16
#
block_size = 16
num_gpu_blocks
=
2048
//
block_size
#
num_gpu_blocks = 2048 // block_size
batch_size
=
1
#
batch_size = 1
with
global_force_attn_backend_context_manager
(
attn_backend
):
#
with global_force_attn_backend_context_manager(attn_backend):
dtype
=
'float16'
if
attn_backend
==
_Backend
.
FLASH_ATTN
else
'float32'
#
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
multi_step_worker
=
create_worker
(
MultiStepWorker
,
#
multi_step_worker = create_worker(MultiStepWorker,
model_name
,
#
model_name,
block_size
,
#
block_size,
num_gpu_blocks
,
#
num_gpu_blocks,
seed
,
#
seed,
model_runner_cls
=
TP1DraftModelRunner
,
#
model_runner_cls=TP1DraftModelRunner,
dtype
=
dtype
)
#
dtype=dtype)
multi_step_worker
.
set_include_gpu_probs_tensor
()
#
multi_step_worker.set_include_gpu_probs_tensor()
worker
=
create_worker
(
Worker
,
#
worker = create_worker(Worker,
model_name
,
#
model_name,
block_size
,
#
block_size,
num_gpu_blocks
,
#
num_gpu_blocks,
seed
,
#
seed,
dtype
=
dtype
)
#
dtype=dtype)
prompts
=
[[
0
]
for
_
in
range
(
batch_size
)]
#
prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence
#
# Already generate two tokens for the sequence
# so that we can simulate the bonus token case
#
# so that we can simulate the bonus token case
multi_step_continuations
=
[[
#
multi_step_continuations = [[
random
.
randint
(
0
,
1000
),
#
random.randint(0, 1000),
random
.
randint
(
0
,
1000
)
#
random.randint(0, 1000)
]
for
_
in
prompts
]
#
] for _ in prompts]
final_prompt_lens
=
[
len
(
prompt
)
+
2
+
num_steps
for
prompt
in
prompts
]
#
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
seq_ids_with_bonus_token_in_last_step
=
set
(
range
(
batch_size
))
#
seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
multi_step_continuations
,
#
continuations=multi_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
# Run multi-step.
#
# Run multi-step.
zero_kv_cache
(
multi_step_worker
.
cache_engine
)
#
zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker
.
sampler_output
(
execute_model_req
=
ExecuteModelRequest
(
#
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
),
#
seq_group_metadata_list=seq_group_metadata_list),
sample_len
=
num_steps
,
#
sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step
=
#
seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step
)
#
seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly.
#
# Run single-step repeatedly.
zero_kv_cache
(
worker
.
cache_engine
)
#
zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first
#
# Generate the kv cache for the bonus token first
single_step_continuations
=
[
c
[:
1
]
for
c
in
multi_step_continuations
]
#
single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
single_step_continuations
,
#
continuations=single_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
single_step_output
=
worker
.
execute_model
(
#
single_step_output = worker.execute_model(
execute_model_req
=
ExecuteModelRequest
(
#
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
))
#
seq_group_metadata_list=seq_group_metadata_list))
for
_
in
range
(
num_steps
):
#
for _ in range(num_steps):
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
multi_step_continuations
,
#
continuations=multi_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
single_step_output
=
worker
.
execute_model
(
#
single_step_output = worker.execute_model(
execute_model_req
=
ExecuteModelRequest
(
#
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
))
#
seq_group_metadata_list=seq_group_metadata_list))
for
i
,
seq_group_output
in
enumerate
(
single_step_output
[
-
1
]):
#
for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations
[
i
].
append
(
#
multi_step_continuations[i].append(
seq_group_output
.
samples
[
0
].
output_token
)
#
seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and
#
# Verify that the KV cache of the single-step and
# multi-step workers are the same.
#
# multi-step workers are the same.
single_step_gpu_cache
=
worker
.
cache_engine
[
0
].
gpu_cache
#
single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache
=
multi_step_worker
.
cache_engine
[
0
].
gpu_cache
#
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers
=
len
(
single_step_gpu_cache
)
#
num_layers = len(single_step_gpu_cache)
allclose
=
lambda
a
,
b
:
torch
.
allclose
(
#
allclose = lambda a, b: torch.allclose(
a
.
cuda
(),
b
.
cuda
(),
rtol
=
1e-2
,
atol
=
1e-2
)
#
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
for
i
in
range
(
num_layers
):
#
for i in range(num_layers):
assert
allclose
(
single_step_gpu_cache
[
i
][
0
],
#
assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache
[
i
][
0
])
#
multi_step_gpu_cache[i][0])
assert
allclose
(
single_step_gpu_cache
[
i
][
1
],
#
assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache
[
i
][
1
])
#
multi_step_gpu_cache[i][1])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
...
tests/spec_decode/test_spec_decode_worker.py
View file @
bd363067
...
@@ -5,6 +5,7 @@ from collections import defaultdict
...
@@ -5,6 +5,7 @@ from collections import defaultdict
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
...
@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from
.test_utils
import
mock_spec_decode_sampler
from
.test_utils
import
mock_spec_decode_sampler
from
.utils
import
(
create_batch
,
create_sampler_output_list
,
create_worker
,
from
.utils
import
(
create_batch
,
create_sampler_output_list
,
create_worker
,
mock_worker
)
mock_worker
)
from
..utils
import
models_path_prefix
@
pytest
.
mark
.
parametrize
(
'k'
,
[
1
,
2
,
6
])
@
pytest
.
mark
.
parametrize
(
'k'
,
[
1
,
2
,
6
])
...
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
...
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks
=
8096
//
block_size
num_gpu_blocks
=
8096
//
block_size
target_worker
=
create_worker
(
target_worker
=
create_worker
(
Worker
,
Worker
,
"JackFram/llama-68m"
,
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
block_size
,
block_size
,
num_gpu_blocks
,
num_gpu_blocks
,
seed
,
seed
,
)
)
draft_worker
=
create_worker
(
draft_worker
=
create_worker
(
MultiStepWorker
,
MultiStepWorker
,
"abhigoyal/vllm-eagle-llama-68m-random"
,
os
.
path
.
join
(
models_path_prefix
,
"abhigoyal/vllm-eagle-llama-68m-random"
)
,
block_size
,
block_size
,
num_gpu_blocks
,
num_gpu_blocks
,
seed
,
seed
,
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
bd363067
...
@@ -7,6 +7,7 @@ import pathlib
...
@@ -7,6 +7,7 @@ import pathlib
import
subprocess
import
subprocess
from
functools
import
partial
from
functools
import
partial
from
unittest.mock
import
MagicMock
,
patch
from
unittest.mock
import
MagicMock
,
patch
from
typing
import
List
,
Tuple
,
Optional
import
openai
import
openai
import
pytest
import
pytest
...
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
...
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
# yapf conflicts with isort for this docstring
# yapf conflicts with isort for this docstring
# yapf: disable
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
...
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
...
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
)
)
model_ref
=
"meta-llama/Llama-2-7b-hf"
model_ref
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-hf"
)
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
...
...
tests/test_config.py
View file @
bd363067
...
@@ -142,7 +142,7 @@ def test_get_sliding_window():
...
@@ -142,7 +142,7 @@ def test_get_sliding_window():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config
():
def
test_get_pooling_config
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_id
,
model_id
,
task
=
"auto"
,
task
=
"auto"
,
...
@@ -164,7 +164,7 @@ def test_get_pooling_config():
...
@@ -164,7 +164,7 @@ def test_get_pooling_config():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config_from_args
():
def
test_get_pooling_config_from_args
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_id
,
model_config
=
ModelConfig
(
model_id
,
task
=
"auto"
,
task
=
"auto"
,
tokenizer
=
model_id
,
tokenizer
=
model_id
,
...
@@ -273,10 +273,10 @@ def test_rope_customization():
...
@@ -273,10 +273,10 @@ def test_rope_customization():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Encoder Decoder models not supported on ROCm."
)
reason
=
"Encoder Decoder models not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
(
"facebook/opt-125m"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
"facebook/bart-base"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-base"
)
,
True
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
,
False
),
(
"meta-llama/Llama-3.2-11B-Vision"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision"
)
,
True
),
])
])
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
config
=
ModelConfig
(
config
=
ModelConfig
(
...
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
...
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"uses_mrope"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"uses_mrope"
),
[
(
"facebook/opt-125m"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
"Qwen/Qwen2-VL-2B-Instruct"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
,
True
),
])
])
def
test_uses_mrope
(
model_id
,
uses_mrope
):
def
test_uses_mrope
(
model_id
,
uses_mrope
):
config
=
ModelConfig
(
config
=
ModelConfig
(
...
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
...
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
def
test_generation_config_loading
():
def
test_generation_config_loading
():
model_id
=
"Qwen/Qwen2.5-1.5B-Instruct"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
# When generation_config is set to "vllm", the default generation config
# When generation_config is set to "vllm", the default generation config
# will not be loaded.
# will not be loaded.
...
...
tests/test_regression.py
View file @
bd363067
...
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
...
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
will never happen again.
will never happen again.
"""
"""
import
os
import
gc
import
gc
import
pytest
import
pytest
...
@@ -12,8 +13,9 @@ import torch
...
@@ -12,8 +13,9 @@ import torch
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
utils
import
models_path_prefix
from
.utils
import
models_path_prefix
import
os
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
...
@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
...
@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
max_tokens
=
256
)
max_tokens
=
256
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
)
prompts
=
[
"This is a short prompt"
,
"This is a very long prompt "
*
1000
]
prompts
=
[
"This is a short prompt"
,
"This is a very long prompt "
*
1000
]
...
@@ -36,7 +38,13 @@ def test_max_tokens_none():
...
@@ -36,7 +38,13 @@ def test_max_tokens_none():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
max_tokens
=
None
)
max_tokens
=
None
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
block_size
=
64
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
)
prompts
=
[
"Just say hello!"
]
prompts
=
[
"Just say hello!"
]
...
@@ -46,7 +54,7 @@ def test_max_tokens_none():
...
@@ -46,7 +54,7 @@ def test_max_tokens_none():
def
test_gc
():
def
test_gc
():
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
enforce_eager
=
True
)
del
llm
del
llm
gc
.
collect
()
gc
.
collect
()
...
@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
...
@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
prompts
=
[
prompts
=
[
...
...
tests/test_sampling_params.py
View file @
bd363067
...
@@ -2,13 +2,15 @@
...
@@ -2,13 +2,15 @@
"""Tests for the SamplingParams class.
"""Tests for the SamplingParams class.
"""
"""
import
os
import
pytest
import
pytest
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
utils
import
models_path_prefix
MODEL_NAME
=
"Qwen/Qwen1.5-7B"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
def
test_max_tokens_none
():
def
test_max_tokens_none
():
...
...
tests/test_utils.py
View file @
bd363067
...
@@ -8,6 +8,7 @@ import socket
...
@@ -8,6 +8,7 @@ import socket
from
collections.abc
import
AsyncIterator
from
collections.abc
import
AsyncIterator
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
os
import
pytest
import
pytest
import
torch
import
torch
from
vllm_test_utils.monitor
import
monitor
from
vllm_test_utils.monitor
import
monitor
...
...
tests/tool_use/utils.py
View file @
bd363067
...
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama"
:
{
"llama"
:
{
"model"
:
"model"
:
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3.1-8B-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama3.2"
:
{
"llama3.2"
:
{
"model"
:
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-3B-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama4"
:
{
"llama4"
:
{
"model"
:
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama4_json"
:
{
"llama4_json"
:
{
"model"
:
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"-tp"
,
"4"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"-tp"
,
"4"
,
"--distributed-executor-backend"
,
"mp"
,
"--tool-call-parser"
,
"--distributed-executor-backend"
,
"mp"
,
"--tool-call-parser"
,
...
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# },
# },
"granite-3.0-8b"
:
{
"granite-3.0-8b"
:
{
"model"
:
"model"
:
"ibm-granite/granite-3.0-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.0-8b-instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"granite"
,
"--chat-template"
,
"--tool-call-parser"
,
"granite"
,
"--chat-template"
,
...
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"granite-3.1-8b"
:
{
"granite-3.1-8b"
:
{
"model"
:
"model"
:
"ibm-granite/granite-3.1-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.1-8b-instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--no-enable-prefix-caching"
,
...
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"internlm"
:
{
"internlm"
:
{
"model"
:
"model"
:
"internlm/internlm2_5-7b-chat"
,
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2_5-7b-chat"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"internlm"
,
"--chat-template"
,
"--tool-call-parser"
,
"internlm"
,
"--chat-template"
,
...
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"toolACE"
:
{
"toolACE"
:
{
"model"
:
"model"
:
"Team-ACE/ToolACE-8B"
,
os
.
path
.
join
(
models_path_prefix
,
"Team-ACE/ToolACE-8B"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
...
tests/tpu/test_quantization_accuracy.py
→
tests/tpu/
un
test_quantization_accuracy.py
View file @
bd363067
...
@@ -4,6 +4,8 @@ from dataclasses import dataclass
...
@@ -4,6 +4,8 @@ from dataclasses import dataclass
import
lm_eval
import
lm_eval
import
pytest
import
pytest
import
os
from
..utils
import
models_path_prefix
TASK
=
"gsm8k"
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
...
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
...
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs.
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS
=
[
ACCURACY_CONFIGS
=
[
GSM8KAccuracyTestConfig
(
GSM8KAccuracyTestConfig
(
model_name
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
)
,
excepted_value
=
0.76
),
# no bias
excepted_value
=
0.76
),
# no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# so only one of these tests can run in a single call to pytest. As
...
...
tests/v1/core/__init__.py
0 → 100644
View file @
bd363067
tests/v1/core/test_kv_cache_utils.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
...
@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec
,
KVCacheTensor
)
KVCacheGroupSpec
,
KVCacheTensor
)
from
vllm.v1.metrics.stats
import
PrefixCacheStats
from
vllm.v1.metrics.stats
import
PrefixCacheStats
from
vllm.v1.request
import
Request
from
vllm.v1.request
import
Request
from
...utils
import
models_path_prefix
# yapf: enable
# yapf: enable
...
@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
...
@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"max_model_len"
,
"want_estimated_max_len"
),
[
(
"model_id"
,
"max_model_len"
,
"want_estimated_max_len"
),
[
(
"Qwen/Qwen1.5-7B"
,
16385
,
16384
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16385
,
16384
),
(
"Qwen/Qwen1.5-7B"
,
16383
,
16383
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
,
16383
,
16383
),
])
])
def
test_estimate_max_model_len
(
model_id
,
max_model_len
,
def
test_estimate_max_model_len
(
model_id
,
max_model_len
,
want_estimated_max_len
):
want_estimated_max_len
):
...
...
tests/v1/core/test_scheduler.py
View file @
bd363067
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
from
typing
import
Optional
from
typing
import
Optional
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
...
@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.request
import
Request
,
RequestStatus
from
vllm.v1.request
import
Request
,
RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
from
vllm.v1.structured_output
import
StructuredOutputManager
from
...utils
import
models_path_prefix
EOS_TOKEN_ID
=
50256
EOS_TOKEN_ID
=
50256
def
create_scheduler
(
def
create_scheduler
(
model
:
str
=
"facebook/opt-125m"
,
model
:
str
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_seqs
:
int
=
16
,
max_num_seqs
:
int
=
16
,
max_num_batched_tokens
:
int
=
8192
,
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
...
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
...
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def
test_schedule_multimodal_requests
():
def
test_schedule_multimodal_requests
():
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
)
scheduler
=
create_scheduler
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
i
,
length
=
100
)]
mm_positions
=
[[
PlaceholderRange
(
offset
=
i
,
length
=
100
)]
for
i
in
range
(
10
)]
for
i
in
range
(
10
)]
requests
=
create_requests
(
requests
=
create_requests
(
...
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
...
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget.
there is insufficient encoder budget.
"""
"""
scheduler
=
create_scheduler
(
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
max_num_batched_tokens
=
1024
,
)
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
100
,
length
=
600
)]
mm_positions
=
[[
PlaceholderRange
(
offset
=
100
,
length
=
600
)]
...
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
...
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def
test_no_mm_input_chunking
():
def
test_no_mm_input_chunking
():
# Disable multimodal input chunking.
# Disable multimodal input chunking.
scheduler
=
create_scheduler
(
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
1024
,
max_num_batched_tokens
=
1024
,
disable_chunked_mm_input
=
True
,
disable_chunked_mm_input
=
True
,
max_model_len
=
2048
,
max_model_len
=
2048
,
...
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
...
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input.
# of a max_num_batched_tokens for the mm input.
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
_
=
create_scheduler
(
_
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
)
,
max_num_batched_tokens
=
100
,
max_num_batched_tokens
=
100
,
disable_chunked_mm_input
=
True
,
disable_chunked_mm_input
=
True
,
)
)
...
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
...
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""
"""
scheduler
=
create_scheduler
(
scheduler
=
create_scheduler
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_num_batched_tokens
=
1024
,
max_num_batched_tokens
=
1024
,
long_prefill_token_threshold
=
400
,
long_prefill_token_threshold
=
400
,
enable_prefix_caching
=
enable_prefix_caching
,
enable_prefix_caching
=
enable_prefix_caching
,
...
...
tests/v1/core/test_scheduler_e2e.py
View file @
bd363067
...
@@ -4,11 +4,12 @@ import os
...
@@ -4,11 +4,12 @@ import os
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
...utils
import
models_path_prefix
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
PROMPT
=
"Hello my name is Robert and I"
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/shutdown/__init__.py
0 → 100644
View file @
bd363067
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment