Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9c4ecf15
Commit
9c4ecf15
authored
Apr 14, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.4' into v0.8.4-ori
parents
bfc2d6f7
dc1b4a6f
Changes
342
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
626 additions
and
86 deletions
+626
-86
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+10
-9
tests/tool_use/conftest.py
tests/tool_use/conftest.py
+24
-1
tests/tool_use/utils.py
tests/tool_use/utils.py
+16
-0
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+10
-4
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+136
-42
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+54
-0
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+49
-0
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+4
-2
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+41
-0
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+39
-0
tests/v1/structured_output/test_utils.py
tests/v1/structured_output/test_utils.py
+8
-8
tests/v1/test_serial_utils.py
tests/v1/test_serial_utils.py
+94
-0
tests/v1/tpu/test_pallas.py
tests/v1/tpu/test_pallas.py
+4
-8
tests/v1/tpu/test_sampler.py
tests/v1/tpu/test_sampler.py
+5
-0
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+19
-6
tools/update-dockerfile-graph.sh
tools/update-dockerfile-graph.sh
+78
-0
vllm/_custom_ops.py
vllm/_custom_ops.py
+11
-0
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+6
-1
vllm/attention/backends/flashinfer.py
vllm/attention/backends/flashinfer.py
+8
-0
vllm/attention/backends/hpu_attn.py
vllm/attention/backends/hpu_attn.py
+10
-5
No files found.
tests/test_sharded_state_loader.py
View file @
9c4ecf15
...
...
@@ -47,12 +47,10 @@ def test_filter_subtensors():
@
pytest
.
fixture
(
scope
=
"module"
)
def
llama_3p2_1b_files
():
with
TemporaryDirectory
()
as
cache_dir
:
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B-Instruct"
,
cache_dir
=
cache_dir
,
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B-Instruct"
,
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
yield
input_dir
yield
input_dir
def
_run_writer
(
input_dir
,
output_dir
,
weights_patterns
,
**
kwargs
):
...
...
@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
# Copy metadata files to output directory
for
file
in
os
.
listdir
(
input_dir
):
if
not
any
(
file
.
endswith
(
ext
)
and
not
os
.
path
.
isdir
(
file
)
for
ext
in
weights_patterns
):
if
os
.
path
.
isdir
(
os
.
path
.
join
(
input_dir
,
file
)):
continue
if
not
any
(
file
.
endswith
(
ext
)
for
ext
in
weights_patterns
):
shutil
.
copy
(
f
"
{
input_dir
}
/
{
file
}
"
,
output_dir
)
...
...
@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
llama_3p2_1b_files
):
llama_3p2_1b_files
,
monkeypatch
:
pytest
.
MonkeyPatch
):
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
...
...
@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization
=
0.8
input_dir
=
llama_3p2_1b_files
ctx
=
mp
.
get_context
(
"spawn"
)
# The interface in the V1 engine has changed; running this test under the V1 engine will hang.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
# Run in separate processes for memory & CUDA isolation
with
TemporaryDirectory
()
as
output_dir
:
...
...
tests/tool_use/conftest.py
View file @
9c4ecf15
...
...
@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
from
.utils
import
ARGS
,
CONFIGS
,
ServerConfig
# select models to test based on command line arguments
def pytest_addoption(parser):
    """Register the command-line options used by the tool-use tests.

    Args:
        parser: the pytest option parser; only its ``addoption`` method
            is used here.
    """
    # Each entry is (flag, keyword arguments forwarded to addoption).
    option_table = (
        ("--models", {
            "nargs": "+",
            "help": "Specify one or more models to test",
        }),
        ("--extended", {
            "action": "store_true",
            "default": False,
            "help": "invoke extended tests requiring large GPUs",
        }),
    )
    for flag, settings in option_table:
        parser.addoption(flag, **settings)
# for each server config, download the model and return the config
@
pytest
.
fixture
(
scope
=
"session"
,
params
=
CONFIGS
.
keys
())
def
server_config
(
request
):
config
=
CONFIGS
[
request
.
param
]
extended
=
request
.
config
.
getoption
(
"--extended"
)
models
=
request
.
config
.
getoption
(
"--models"
)
config_keys_to_test
=
[
key
for
key
in
CONFIGS
if
(
models
is
None
or
key
in
models
)
and
(
extended
or
not
CONFIGS
[
key
].
get
(
"extended"
,
False
))
]
config_key
=
request
.
param
if
config_key
not
in
config_keys_to_test
:
pytest
.
skip
(
f
"Skipping config '
{
config_key
}
'"
)
config
=
CONFIGS
[
config_key
]
if
current_platform
.
is_rocm
()
and
not
config
.
get
(
"supports_rocm"
,
True
):
pytest
.
skip
(
"The {} model can't be tested on the ROCm platform"
.
format
(
...
...
tests/tool_use/utils.py
View file @
9c4ecf15
...
...
@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False):
system_prompt
:
Optional
[
str
]
supports_parallel
:
Optional
[
bool
]
supports_rocm
:
Optional
[
bool
]
extended
:
Optional
[
bool
]
# tests do not run in CI automatically
def
patch_system_prompt
(
messages
:
list
[
dict
[
str
,
Any
]],
...
...
@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel"
:
False
,
},
"llama4"
:
{
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
str
(
VLLM_PATH
/
"examples/tool_chat_template_llama4_pythonic.jinja"
),
"-tp"
,
"4"
],
"supports_parallel"
:
False
,
"extended"
:
True
},
"mistral"
:
{
"model"
:
"mistralai/Mistral-7B-Instruct-v0.3"
,
...
...
tests/tpu/test_compilation.py
View file @
9c4ecf15
...
...
@@ -44,7 +44,7 @@ def test_tpu_compilation():
assert
generated_text
.
startswith
(
answer
)
compiled_codes
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*
for_forward
.py"
)))
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
...
...
@@ -52,15 +52,21 @@ def test_tpu_compilation():
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have
4
compiled codes
# Check we have
2
compiled codes
assert
len
(
compiled_codes
)
==
2
kv_cache_prefix
=
"kv_cache"
attn_prefix
=
"ragged_paged_attention"
def extract_compiled_index(s):
    """Return the first all-digit token in the file name of path ``s``.

    The base name is split on ``.`` and ``_`` and the first piece that
    consists only of decimal digits is returned as an ``int``. Only the
    base name is inspected, so digits in a parent directory (for example
    a random temp dir such as ``tmp_12_ab``) are never mistaken for the
    index.

    Args:
        s: path (or bare file name) of a dumped compiled-function file.

    Returns:
        The first numeric token of the file name.

    Raises:
        IndexError: if the file name contains no all-digit token.
    """
    file_name = os.path.basename(s)
    parts = file_name.replace(".", "_").split("_")
    # isdecimal() accepts exactly the characters int() can parse;
    # isdigit() also accepts e.g. superscripts, which int() rejects.
    numbers = [int(part) for part in parts if part.isdecimal()]
    return numbers[0]
# Check all the compilations are as expected
compiled_fns
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
compiled_fns
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)),
key
=
lambda
s
:
extract_compiled_index
(
s
))
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
9c4ecf15
...
...
@@ -3,14 +3,17 @@
import
pytest
import
torch
from
vllm.multimodal.inputs
import
MultiModalKwargs
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
PlaceholderRange
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
sha256
from
vllm.utils
import
GiB_bytes
,
sha256
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from
vllm.v1.core.kv_cache_utils
import
(
NONE_HASH
,
BlockHashType
,
FreeKVCacheBlockQueue
,
KVCacheBlock
,
PrefixCachingMetrics
,
estimate_max_model_len
,
generate_block_hash_extra_keys
,
hash_block_tokens
,
hash_request_tokens
,
...
...
@@ -46,6 +49,18 @@ def make_request(request_id,
)
def new_kv_cache_spec(block_size=16,
                      num_kv_heads=2,
                      head_size=64,
                      dtype=torch.float32,
                      use_mla=False):
    """Build a ``FullAttentionSpec`` for tests.

    Every argument is forwarded unchanged to ``FullAttentionSpec`` under
    the same keyword name; the defaults give a small spec.
    """
    spec_kwargs = {
        "block_size": block_size,
        "num_kv_heads": num_kv_heads,
        "head_size": head_size,
        "dtype": dtype,
        "use_mla": use_mla,
    }
    return FullAttentionSpec(**spec_kwargs)
def
test_none_hash
():
assert
NONE_HASH
is
not
None
assert
isinstance
(
NONE_HASH
,
int
)
...
...
@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
request
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
20
)],
mm_positions
=
[{
"offset"
:
0
,
"length"
:
5
},
{
"offset"
:
10
,
"length"
:
5
}],
mm_positions
=
[
PlaceholderRange
(
offset
=
0
,
length
=
5
),
PlaceholderRange
(
offset
=
10
,
length
=
5
),
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
...
...
@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
request
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
"offset"
:
0
,
"length"
:
3
},
{
"offset"
:
3
,
"length"
:
3
}],
mm_positions
=
[
PlaceholderRange
(
offset
=
0
,
length
=
3
),
PlaceholderRange
(
offset
=
3
,
length
=
3
),
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
...
...
@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
"offset"
:
0
,
"length"
:
3
},
{
"offset"
:
3
,
"length"
:
3
}],
mm_positions
=
[
PlaceholderRange
(
offset
=
0
,
length
=
3
),
PlaceholderRange
(
offset
=
3
,
length
=
3
),
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
request2
=
make_request
(
request_id
=
1
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
"offset"
:
0
,
"length"
:
3
},
{
"offset"
:
3
,
"length"
:
3
}],
mm_positions
=
[
PlaceholderRange
(
offset
=
0
,
length
=
3
),
PlaceholderRange
(
offset
=
3
,
length
=
3
),
],
mm_hashes
=
[
"hash3"
,
"hash2"
],
)
block_size
=
3
...
...
@@ -337,18 +340,6 @@ def test_metrics():
def
test_unify_kv_cache_configs
():
def
new_kv_cache_spec
(
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
use_mla
=
False
):
return
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
dtype
=
dtype
,
use_mla
=
use_mla
)
same_kv_cache_config
=
[
KVCacheConfig
(
num_blocks
=
10
,
...
...
@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
]
with
pytest
.
raises
(
AssertionError
):
unify_kv_cache_configs
(
diff_kv_cache_config
)
@pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"),
    [
        ("Qwen/Qwen1.5-7B", 16385, 16384),
        ("Qwen/Qwen1.5-7B", 16383, 16383),
    ],
)
def test_estimate_max_model_len(model_id, max_model_len,
                                want_estimated_max_len):
    """Check ``estimate_max_model_len`` against an 8 GiB memory budget.

    With the specs built below, a requested length of 16385 is expected
    to come back as 16384, while 16383 is expected to come back
    unchanged.
    """
    # Assemble the VllmConfig (model config first, then scheduler config).
    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model_id,
            task="generate",
            tokenizer=model_id,
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            max_model_len=max_model_len,
        ),
        scheduler_config=SchedulerConfig(max_num_batched_tokens=32768),
    )

    # One full-attention KV cache spec per layer, 32 layers in total.
    kv_cache_spec = {
        f"layer_{layer_idx}": FullAttentionSpec(
            block_size=16,
            num_kv_heads=32,
            head_size=128,
            dtype=torch.float16,
            use_mla=False,
        )
        for layer_idx in range(32)
    }

    # Per the original author's note, a model length of 16384 needs 8 GB.
    available_memory = 8 * GiB_bytes
    estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
                                               available_memory)
    assert estimated_max_len == want_estimated_max_len
def test_allocate_with_lookahead():
    """Check how lookahead tokens change the number of allocated blocks."""
    block_size = 4
    config = KVCacheConfig(
        num_blocks=10,
        tensors={"layer1": KVCacheTensor(100)},
        kv_cache_groups=[
            KVCacheGroupSpec(["layer1"],
                             new_kv_cache_spec(block_size=block_size)),
        ],
    )
    request = make_request(
        request_id=0,
        prompt_token_ids=[],
        mm_positions=None,
        mm_hashes=None,
    )

    # (num_preallocate_tokens, num_lookahead_tokens, expected block count)
    cases = (
        # Case 1: lookahead only.
        #   required: 3 + 2 = 5 tokens -> ceil(5 / 4) = 2 blocks
        (0, 2, 2),
        # Case 2: preallocation plus a short lookahead.
        #   num_preallocate_blocks = 4 // 4 - 2 // 4 = 1
        #   required_blocks = ceil((3 + 2) / 4) = 2
        #   total_blocks = 1 + 2 = 3
        (4, 2, 3),
        # Case 3: preallocation fully covered by the lookahead.
        #   num_preallocate_blocks = 4 // 4 - 4 // 4 = 0
        #   required_blocks = ceil((3 + 4) / 4) = 2
        #   total_blocks = 0 + 2 = 2
        (4, 4, 2),
    )
    for num_preallocate_tokens, num_lookahead_tokens, want_blocks in cases:
        # A fresh manager per case; the same request object is reused.
        kv_cache_manager = KVCacheManager(
            kv_cache_config=config,
            max_model_len=100,
            num_preallocate_tokens=num_preallocate_tokens,
        )
        blocks = kv_cache_manager.allocate_slots(
            request,
            num_tokens=3,
            num_lookahead_tokens=num_lookahead_tokens,
        )
        assert len(blocks) == want_blocks
tests/v1/core/test_scheduler.py
View file @
9c4ecf15
...
...
@@ -24,6 +24,7 @@ def create_scheduler(
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
long_prefill_token_threshold
:
int
=
0
,
disable_chunked_mm_input
:
bool
=
False
,
)
->
Scheduler
:
'''Create scheduler under test.
...
...
@@ -43,6 +44,7 @@ def create_scheduler(
max_num_batched_tokens
=
max_num_batched_tokens
,
max_model_len
=
max_num_batched_tokens
,
long_prefill_token_threshold
=
long_prefill_token_threshold
,
disable_chunked_mm_input
=
disable_chunked_mm_input
,
)
model_config
=
ModelConfig
(
model
=
model
,
...
...
@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
assert
requests
[
2
].
request_id
not
in
output
.
num_scheduled_tokens
def test_no_mm_input_chunking():
    """Schedule a request with chunked multimodal input disabled.

    The request has 1200 tokens with a multimodal placeholder covering
    offsets 400..1199. The first step must schedule only the 400 leading
    tokens and the second step the remaining 800.
    """
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
    )
    requests = create_requests(
        num_requests=1,
        num_tokens=1200,
        mm_positions=[[PlaceholderRange(offset=400, length=800)]],
    )
    for req in requests:
        scheduler.add_request(req)
    first_req_id = requests[0].request_id

    # Step 1: only the 400 text tokens before the placeholder are scheduled.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 1
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    assert output.num_scheduled_tokens[first_req_id] == 400

    # Report a model step that sampled no tokens for any request.
    req_ids = [req.request_id for req in requests]
    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index={
            req.request_id: idx
            for idx, req in enumerate(requests)
        },
        sampled_token_ids=[[] for _ in range(len(requests))],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(output, model_runner_output)

    # Step 2: the remaining 800 tokens are scheduled together.
    output = scheduler.schedule()
    assert len(scheduler.running) == 1
    assert len(output.scheduled_new_reqs) == 0
    assert len(output.scheduled_cached_reqs) == 1
    assert len(output.finished_req_ids) == 0
    assert output.num_scheduled_tokens[first_req_id] == 800

    # With chunked mm input disabled, a max_num_batched_tokens that is
    # too small for the mm input must be rejected.
    with pytest.raises(ValueError):
        _ = create_scheduler(
            model="llava-hf/llava-1.5-7b-hf",
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
@
pytest
.
mark
.
parametrize
(
"enable_prefix_caching"
,
[
True
,
False
])
def
test_schedule_concurrent_partial_requests
(
enable_prefix_caching
:
bool
):
"""Test scheduling behavior with concurrent partial requests.
...
...
tests/v1/e2e/test_
ngram_
spec_decode.py
→
tests/v1/e2e/test_spec_decode.py
View file @
9c4ecf15
...
...
@@ -53,6 +53,11 @@ def model_name():
return
"meta-llama/Meta-Llama-3-8B-Instruct"
@pytest.fixture
def eagle_model_name():
    """Fixture returning the name of the EAGLE model used in this module."""
    eagle_checkpoint = "yuhuili/EAGLE-LLaMA3-Instruct-8B"
    return eagle_checkpoint
def
test_ngram_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
test_prompts
:
list
[
list
[
dict
[
str
,
Any
]]],
...
...
@@ -95,3 +100,47 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert
matches
>
int
(
0.7
*
len
(
ref_outputs
))
del
spec_llm
def
test_eagle_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
test_prompts
:
list
[
list
[
dict
[
str
,
Any
]]],
sampling_config
:
SamplingParams
,
model_name
:
str
,
eagle_model_name
:
str
,
):
'''
Compare the outputs of an original LLM and a speculative LLM; they
should be the same when using eagle speculative decoding.
'''
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
ref_llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
)
ref_outputs
=
ref_llm
.
chat
(
test_prompts
,
sampling_config
)
del
ref_llm
spec_llm
=
LLM
(
model
=
model_name
,
speculative_config
=
{
"method"
:
"eagle"
,
"model"
:
eagle_model_name
,
"num_speculative_tokens"
:
3
,
},
max_model_len
=
1024
,
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
matches
=
0
misses
=
0
for
ref_output
,
spec_output
in
zip
(
ref_outputs
,
spec_outputs
):
if
ref_output
.
outputs
[
0
].
text
==
spec_output
.
outputs
[
0
].
text
:
matches
+=
1
else
:
misses
+=
1
print
(
f
"ref_output:
{
ref_output
.
outputs
[
0
].
text
}
"
)
print
(
f
"spec_output:
{
spec_output
.
outputs
[
0
].
text
}
"
)
# Heuristic: expect at least 70% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert
matches
>
int
(
0.7
*
len
(
ref_outputs
))
del
spec_llm
tests/v1/engine/test_engine_args.py
View file @
9c4ecf15
...
...
@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
# For H100 and H200, we use larger default values.
default_llm_tokens
=
16384
default_server_tokens
=
8192
default_max_num_seqs
=
1024
else
:
default_llm_tokens
=
8192
default_server_tokens
=
2048
default_max_num_seqs
=
256
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
default_max_num_seqs
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_llm_tokens
# noqa: E501
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
OPENAI_API_SERVER
)
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
default_max_num_seqs
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_server_tokens
# noqa: E501
tests/v1/engine/test_engine_core_client.py
View file @
9c4ecf15
...
...
@@ -3,8 +3,10 @@
import
asyncio
import
time
import
uuid
from
threading
import
Thread
from
typing
import
Optional
import
psutil
import
pytest
from
transformers
import
AutoTokenizer
...
...
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
@pytest.mark.timeout(10)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
    """Kill the engine core child during startup and check the error.

    A background thread polls for a newly spawned child process and
    kills it; ``EngineCoreClient.make_client`` is then expected to raise
    an exception whose message contains
    "Engine core initialization failed".
    """
    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
        m.setenv("VLLM_USE_V1", "1")

        vllm_config = EngineArgs(model=MODEL_NAME).create_engine_config(
            usage_context=UsageContext.UNKNOWN_CONTEXT)
        executor_class = Executor.get_class(vllm_config)

        # Snapshot the existing children so that only processes spawned
        # from here on are candidates for killing.
        this_proc = psutil.Process()
        children_before = set(this_proc.children())

        def kill_first_child():
            # Poll every 0.5 s until a new child shows up, kill it, and
            # stop - this simulates a fatal uncaught process exit.
            while True:
                time.sleep(0.5)
                new_children = set(this_proc.children()) - children_before
                if not new_children:
                    continue
                victim = new_children.pop()
                print("Killing child core process", victim.pid)
                victim.kill()
                return

        killer = Thread(target=kill_first_child, daemon=True)
        killer.start()

        _core_client = EngineCoreClient.make_client(
            multiprocess_mode=True,
            asyncio_mode=True,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=True,
        )

    # e_info is only populated once the pytest.raises block has exited.
    assert "Engine core initialization failed" in str(e_info.value)
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
9c4ecf15
...
...
@@ -325,6 +325,45 @@ def test_structured_output(
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
#
# Test 10: Generate structured with minLength and maxLength
#
min_length
=
50
max_length
=
50
json_schema
=
{
"type"
:
"object"
,
"properties"
:
{
"description"
:
{
"type"
:
"string"
,
"maxLength"
:
max_length
,
"minLength"
:
min_length
}
},
"required"
:
[
"description"
]
}
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
json_schema
))
outputs
=
llm
.
generate
(
prompts
=
"Generate a description of a frog using 50 characters."
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
...
...
tests/v1/structured_output/test_utils.py
View file @
9c4ecf15
...
...
@@ -13,14 +13,6 @@ def unsupported_string_schemas():
"type"
:
"string"
,
"pattern"
:
"^[a-zA-Z]+$"
},
{
"type"
:
"string"
,
"minLength"
:
1
},
{
"type"
:
"string"
,
"maxLength"
:
100
},
{
"type"
:
"string"
,
"format"
:
"email"
...
...
@@ -164,6 +156,14 @@ def supported_schema():
"type"
:
"string"
,
"enum"
:
[
"sedan"
,
"suv"
,
"truck"
]
},
"short_description"
:
{
"type"
:
"string"
,
"maxLength"
:
50
},
"long_description"
:
{
"type"
:
"string"
,
"minLength"
:
50
},
"address"
:
{
"type"
:
"object"
,
"properties"
:
{
...
...
tests/v1/test_serial_utils.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
from
collections
import
UserDict
from
dataclasses
import
dataclass
import
numpy
as
np
import
torch
from
vllm.v1.serial_utils
import
MsgpackDecoder
,
MsgpackEncoder
class UnrecognizedType(UserDict):
    """Empty ``UserDict`` that additionally carries an integer attribute.

    Used as the ``unrecognized`` field of ``MyType``; the name suggests
    it stands for a type the serializer has no dedicated handling for
    (TODO confirm against ``vllm.v1.serial_utils``).
    """

    def __init__(self, an_int: int):
        # Start with empty mapping data; the int lives on the instance.
        super().__init__()
        self.an_int = an_int
@dataclass
class MyType:
    """Payload used by ``test_encode_decode`` for the round-trip check."""

    # Plain tensor field.
    tensor1: torch.Tensor
    # Ordinary string field.
    a_string: str
    # Tensors nested inside a list.
    list_of_tensors: list[torch.Tensor]
    # NumPy array field.
    numpy_array: np.ndarray
    # UserDict subclass carrying an ``an_int`` attribute.
    unrecognized: UnrecognizedType
    # The test fills these two with transposed tensors.
    small_f_contig_tensor: torch.Tensor
    large_f_contig_tensor: torch.Tensor
    # The test fills these two with column slices.
    small_non_contig_tensor: torch.Tensor
    large_non_contig_tensor: torch.Tensor
def test_encode_decode():
    """Round-trip a ``MyType`` through ``MsgpackEncoder``/``MsgpackDecoder``.

    Covers both ``encode`` and ``encode_into`` and checks the number of
    buffers each produces.
    """
    payload = MyType(
        tensor1=torch.randint(low=0,
                              high=100,
                              size=(1024, ),
                              dtype=torch.int32),
        a_string="hello",
        list_of_tensors=[
            torch.rand((1, 10), dtype=torch.float32),
            torch.rand((3, 5, 4000), dtype=torch.float64),
            torch.tensor(1984),  # a scalar tensor as well
        ],
        numpy_array=np.arange(512),
        unrecognized=UnrecognizedType(33),
        small_f_contig_tensor=torch.rand(5, 4).t(),
        large_f_contig_tensor=torch.rand(1024, 4).t(),
        small_non_contig_tensor=torch.rand(2, 4)[:, 1:3],
        large_non_contig_tensor=torch.rand(1024, 512)[:, 10:20],
    )

    encoder, decoder = MsgpackEncoder(), MsgpackDecoder(MyType)

    # Expected: the main buffer + 4 large tensor buffers + 1 large numpy
    # array. The small tensors are encoded inline in the main buffer.
    # NOTE(review): the original comment gave the cut-off as
    # '"large" is <= 512 bytes', which reads inverted - presumably "large"
    # means at least 512 bytes; confirm against serial_utils.
    expected_num_buffers = 6

    buffers = encoder.encode(payload)
    assert len(buffers) == expected_num_buffers
    round_tripped: MyType = decoder.decode(buffers)
    assert_equal(round_tripped, payload)

    # encode_into: the caller-supplied bytearray must be the first
    # element of the returned sequence.
    scratch = bytearray()
    buffers_into = encoder.encode_into(payload, scratch)
    assert len(buffers_into) == expected_num_buffers
    assert buffers_into[0] is scratch
    round_tripped_into: MyType = decoder.decode(buffers_into)
    assert_equal(round_tripped_into, payload)
def assert_equal(obj1: MyType, obj2: MyType):
    """Assert that two ``MyType`` instances hold equal field values.

    Tensors are compared with ``torch.equal``, the NumPy array with
    ``np.array_equal``, and the ``unrecognized`` field by its ``an_int``
    attribute only.

    Raises:
        AssertionError: if any compared field differs.
    """
    assert torch.equal(obj1.tensor1, obj2.tensor1)
    assert obj1.a_string == obj2.a_string
    # zip() stops at the shorter list, so check the lengths explicitly;
    # otherwise dropped trailing tensors would go unnoticed.
    assert len(obj1.list_of_tensors) == len(obj2.list_of_tensors)
    assert all(
        torch.equal(a, b)
        for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors))
    assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
    assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
    assert torch.equal(obj1.small_f_contig_tensor,
                       obj2.small_f_contig_tensor)
    assert torch.equal(obj1.large_f_contig_tensor,
                       obj2.large_f_contig_tensor)
    assert torch.equal(obj1.small_non_contig_tensor,
                       obj2.small_non_contig_tensor)
    assert torch.equal(obj1.large_non_contig_tensor,
                       obj2.large_non_contig_tensor)
tests/v1/tpu/test_pallas.py
View file @
9c4ecf15
...
...
@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
import
torch
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.v1.attention.backends.pallas
import
(
NUM_KV_PAGES_PER_BLOCK
,
NUM_QUERIES_PER_BLOCK
,
PallasAttentionBackendImpl
,
from
vllm.v1.attention.backends.pallas
import
(
PallasAttentionBackendImpl
,
PallasMetadata
)
...
...
@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
logits_soft_cap
=
logits_soft_cap
,
attn_type
=
AttentionType
.
DECODER
,
)
mock_vmem_limit_bytes
=
1024
attn_impl
.
vmem_limit_bytes
=
mock_vmem_limit_bytes
class
FakeAttentionLayer
:
_k_scale_float
:
float
...
...
@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
ANY
,
# block_tables
ANY
,
# query_start_loc
ANY
,
# num_seqs
num_kv_pages_per_block
=
N
UM_KV_PAGES_PER_BLOCK
,
num_queries_per_block
=
N
UM_QUERIES_PER_BLOCK
,
vmem_limit_bytes
=
mock_vmem_limit_bytes
,
num_kv_pages_per_block
=
N
one
,
num_queries_per_block
=
N
one
,
vmem_limit_bytes
=
None
,
use_kernel
=
True
,
sm_scale
=
scale
,
sliding_window
=
sliding_window
,
...
...
tests/v1/tpu/test_sampler.py
View file @
9c4ecf15
...
...
@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
sampling_params
=
SamplingParams
(
temperature
=
0.1
,
min_p
=
0.8
,
max_tokens
=
64
)
output2
=
llm
.
generate
(
prompts
,
sampling_params
)
assert
output
[
0
].
outputs
[
0
].
text
!=
output2
[
0
].
outputs
[
0
].
text
with
pytest
.
raises
(
ValueError
):
# Unsupported `seed` param.
sampling_params
=
SamplingParams
(
temperature
=
0.3
,
seed
=
42
)
output2
=
llm
.
generate
(
prompts
,
sampling_params
)
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
9c4ecf15
...
...
@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
SchedulerOutput
)
from
vllm.v1.worker.tpu_model_runner
import
(
TPUModelRunner
,
_get_padded_token_len
,
_get
_paddings
)
from
vllm.v1.worker.tpu_model_runner
import
(
TPUModelRunner
,
_get_padded_num_reqs_with_upper_limit
,
_get_padded_token_len
,
_get_req_paddings
,
_get_token
_paddings
)
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher
=
mock
.
patch
.
dict
(
...
...
@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
def
test_get_paddings
():
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
expected_paddings
=
[
16
,
32
,
64
,
128
,
192
,
256
,
320
,
384
,
448
,
512
]
actual_paddings
=
_get_paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
actual_paddings
=
_get_
token_
paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
assert
actual_paddings
==
expected_paddings
def
test_get_padded_token_len
():
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
paddings
=
_get_paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
paddings
=
_get_
token_
paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
assert
_get_padded_token_len
(
paddings
,
1
)
==
16
assert
_get_padded_token_len
(
paddings
,
16
)
==
16
assert
_get_padded_token_len
(
paddings
,
20
)
==
32
assert
_get_padded_token_len
(
paddings
,
300
)
==
320
assert
_get_padded_token_len
(
paddings
,
512
)
==
512
def test_get_padded_num_reqs_with_upper_limit():
    """Check ``_get_padded_num_reqs_with_upper_limit`` on sample inputs."""
    # (first argument, second argument, expected result); judging by the
    # function name these are presumably a request count and its upper
    # limit - verify against tpu_model_runner.
    expectations = (
        (3, 32, 8),
        (9, 32, 16),
        (19, 32, 32),
        (17, 28, 28),
    )
    for num_reqs, upper_limit, want in expectations:
        assert _get_padded_num_reqs_with_upper_limit(num_reqs,
                                                     upper_limit) == want
def test_get_req_paddings():
    """Check the padding lists returned by ``_get_req_paddings``."""
    # (first argument, second argument, expected list of paddings)
    expectations = (
        (1, 32, [8, 16, 32]),
        (8, 32, [8, 16, 32]),
        (8, 36, [8, 16, 32, 36]),
    )
    for lower, upper, want in expectations:
        assert _get_req_paddings(lower, upper) == want
tools/update-dockerfile-graph.sh
0 → 100755
View file @
9c4ecf15
#!/bin/bash
# Update Dockerfile dependency graph when docker/Dockerfile changes.
# This script is designed to be used as a pre-commit hook.
#
# Exit status:
#   0 - docker/Dockerfile is not staged, Docker is unavailable, or the
#       regenerated graph is identical to the existing one.
#   1 - the graph changed (the updated file must be staged) or the
#       generated PNG could not be found.

set -euo pipefail

# Capture the staged file list before matching. Piping `git diff` straight
# into `grep -q` is fragile under `pipefail`: grep exits on the first match
# and git may then die with SIGPIPE, which makes the whole pipeline "fail".
# A git failure is treated as "nothing staged".
STAGED_FILES=$(git diff --cached --name-only || true)

# Check if docker/Dockerfile is staged for commit
if grep -qFx "docker/Dockerfile" <<<"$STAGED_FILES"; then
  echo "docker/Dockerfile has changed, attempting to update dependency graph..."

  # Check if Docker is installed and running
  if ! command -v docker &> /dev/null; then
    echo "Warning: Docker command not found. Skipping Dockerfile graph update."
    echo "Please install Docker to automatically update the graph: https://docs.docker.com/get-docker/"
    exit 0
  fi
  if ! docker info &> /dev/null; then
    echo "Warning: Docker daemon is not running. Skipping Dockerfile graph update."
    echo "Please start Docker to automatically update the graph."
    exit 0
  fi

  # Define the target file path
  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"

  # Ensure target directory exists
  mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"

  # Store old image hash in a variable if the file exists
  OLD_HASH=""
  if [ -f "$TARGET_GRAPH_FILE" ]; then
    OLD_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  fi

  # Generate Dockerfile graph
  echo "Running dockerfilegraph tool..."
  docker run \
    --rm \
    --user "$(id -u):$(id -g)" \
    --workdir /workspace \
    --volume "$(pwd)":/workspace \
    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
    --output png \
    --dpi 200 \
    --max-label-length 50 \
    --filename docker/Dockerfile \
    --legend

  echo "Finding generated PNG file..."
  # Check for Dockerfile.png in the root directory (most likely location)
  if [ -f "./Dockerfile.png" ]; then
    echo "Found generated file at: ./Dockerfile.png"
    mv "./Dockerfile.png" "$TARGET_GRAPH_FILE"
  else
    # Try to find it elsewhere. `-print -quit` stops at the first match
    # without a `| head -1` pipe, which could abort the script via
    # SIGPIPE under `set -eo pipefail`.
    DOCKERFILE_PNG=$(find . -name "Dockerfile.png" -type f -print -quit)
    if [ -n "$DOCKERFILE_PNG" ]; then
      echo "Found generated file at: $DOCKERFILE_PNG"
      mv "$DOCKERFILE_PNG" "$TARGET_GRAPH_FILE"
    else
      echo "Error: Could not find the generated PNG file"
      # List recently modified PNG files to help with debugging.
      find . -name "*.png" -type f -mmin -5
      exit 1
    fi
  fi

  # Check if the graph has changed
  NEW_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  if [ "$NEW_HASH" != "$OLD_HASH" ]; then
    echo "Graph has changed. Please stage the updated file: $TARGET_GRAPH_FILE"
    exit 1
  else
    echo "No changes in graph detected."
  fi
fi

exit 0
vllm/_custom_ops.py
View file @
9c4ecf15
...
...
@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu(
block_tables
,
seq_lens
)
# merge attn states ops
def merge_attn_states(output: torch.Tensor,
                      prefix_output: torch.Tensor,
                      prefix_lse: torch.Tensor,
                      suffix_output: torch.Tensor,
                      suffix_lse: torch.Tensor,
                      output_lse: Optional[torch.Tensor] = None) -> None:
    """Call the ``torch.ops._C.merge_attn_states`` custom op.

    Note that the op takes ``output_lse`` as its second positional
    argument, right after ``output``, unlike this wrapper's signature
    where it comes last.

    Nothing is returned; presumably the op writes its result into
    ``output`` (and ``output_lse`` when given) - confirm against the
    kernel.
    """
    merge_op = torch.ops._C.merge_attn_states
    merge_op(output, output_lse, prefix_output, prefix_lse, suffix_output,
             suffix_lse)
# pos encoding ops
def
rotary_embedding
(
positions
:
torch
.
Tensor
,
...
...
vllm/attention/backends/flash_attn.py
View file @
9c4ecf15
...
...
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert
self
.
use_cuda_graph
if
turn_prefills_into_decodes
:
# When Mu
t
li-Step is enabled with Chunked-Prefill, prefills and
# When Mul
t
i-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.
...
...
@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
if
use_irope
:
logger
.
warning
(
"Using irope in V0 is not supported yet, it will fall back "
"to global attention for long context."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/attention/backends/flashinfer.py
View file @
9c4ecf15
...
...
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
from
vllm.attention.layer
import
Attention
from
vllm.attention.ops.paged_attn
import
PagedAttention
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
from
vllm.logger
import
init_logger
from
vllm.utils
import
(
async_tensor_h2d
,
get_kv_cache_torch_dtype
,
make_tensor_with_pad
)
logger
=
init_logger
(
__name__
)
if
TYPE_CHECKING
:
from
vllm.worker.model_runner
import
(
ModelInputForGPUBuilder
,
ModelInputForGPUWithSamplingMetadata
)
...
...
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
if
use_irope
:
logger
.
warning_once
(
"Using irope in FlashInfer is not supported yet, it will fall"
" back to global attention for long context."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
...
...
vllm/attention/backends/hpu_attn.py
View file @
9c4ecf15
...
...
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
max_seq_len
:
int
=
4096
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
super
(
AttentionImpl
,
self
).
__init__
()
if
use_irope
:
logger
.
warning_once
(
"Using irope in HPU is not supported yet, it will fall back "
"to global attention for long context."
)
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
...
...
@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
self
.
fused_scaled_dot_product_attention
=
ModuleFusedSDPA
(
FusedSDPA
)
except
ImportError
:
logger
()
.
warning
(
"Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation."
)
logger
.
warning
(
"Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation."
)
suppored_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppored_head_sizes
:
suppor
t
ed_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppor
t
ed_head_sizes
:
raise
ValueError
(
f
"Head size
{
head_size
}
is not supported by PagedAttention. "
f
"Supported head sizes are:
{
suppored_head_sizes
}
."
)
f
"Supported head sizes are:
{
suppor
t
ed_head_sizes
}
."
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment