Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9c4ecf15
"docs/vscode:/vscode.git/clone" did not exist on "189860102539b54098cfa04b6381ee86c53a16c1"
Commit
9c4ecf15
authored
Apr 14, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.4' into v0.8.4-ori
parents
bfc2d6f7
dc1b4a6f
Changes
342
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
626 additions
and
86 deletions
+626
-86
tests/test_sharded_state_loader.py
tests/test_sharded_state_loader.py
+10
-9
tests/tool_use/conftest.py
tests/tool_use/conftest.py
+24
-1
tests/tool_use/utils.py
tests/tool_use/utils.py
+16
-0
tests/tpu/test_compilation.py
tests/tpu/test_compilation.py
+10
-4
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+136
-42
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+54
-0
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+49
-0
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+4
-2
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+41
-0
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+39
-0
tests/v1/structured_output/test_utils.py
tests/v1/structured_output/test_utils.py
+8
-8
tests/v1/test_serial_utils.py
tests/v1/test_serial_utils.py
+94
-0
tests/v1/tpu/test_pallas.py
tests/v1/tpu/test_pallas.py
+4
-8
tests/v1/tpu/test_sampler.py
tests/v1/tpu/test_sampler.py
+5
-0
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+19
-6
tools/update-dockerfile-graph.sh
tools/update-dockerfile-graph.sh
+78
-0
vllm/_custom_ops.py
vllm/_custom_ops.py
+11
-0
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+6
-1
vllm/attention/backends/flashinfer.py
vllm/attention/backends/flashinfer.py
+8
-0
vllm/attention/backends/hpu_attn.py
vllm/attention/backends/hpu_attn.py
+10
-5
No files found.
tests/test_sharded_state_loader.py
View file @
9c4ecf15
...
@@ -47,12 +47,10 @@ def test_filter_subtensors():
...
@@ -47,12 +47,10 @@ def test_filter_subtensors():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llama_3p2_1b_files
():
def
llama_3p2_1b_files
():
with
TemporaryDirectory
()
as
cache_dir
:
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B-Instruct"
,
input_dir
=
snapshot_download
(
"meta-llama/Llama-3.2-1B-Instruct"
,
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
cache_dir
=
cache_dir
,
ignore_patterns
=
[
"*.bin*"
,
"original/*"
])
yield
input_dir
yield
input_dir
def
_run_writer
(
input_dir
,
output_dir
,
weights_patterns
,
**
kwargs
):
def
_run_writer
(
input_dir
,
output_dir
,
weights_patterns
,
**
kwargs
):
...
@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
...
@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
# Copy metadata files to output directory
# Copy metadata files to output directory
for
file
in
os
.
listdir
(
input_dir
):
for
file
in
os
.
listdir
(
input_dir
):
if
not
any
(
if
os
.
path
.
isdir
(
os
.
path
.
join
(
input_dir
,
file
)):
file
.
endswith
(
ext
)
and
not
os
.
path
.
isdir
(
file
)
continue
for
ext
in
weights_patterns
):
if
not
any
(
file
.
endswith
(
ext
)
for
ext
in
weights_patterns
):
shutil
.
copy
(
f
"
{
input_dir
}
/
{
file
}
"
,
output_dir
)
shutil
.
copy
(
f
"
{
input_dir
}
/
{
file
}
"
,
output_dir
)
...
@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
...
@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"enable_lora"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
def
test_sharded_state_loader
(
enable_lora
,
tp_size
,
num_gpus_available
,
llama_3p2_1b_files
):
llama_3p2_1b_files
,
monkeypatch
:
pytest
.
MonkeyPatch
):
if
num_gpus_available
<
tp_size
:
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
...
@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
...
@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization
=
0.8
gpu_memory_utilization
=
0.8
input_dir
=
llama_3p2_1b_files
input_dir
=
llama_3p2_1b_files
ctx
=
mp
.
get_context
(
"spawn"
)
ctx
=
mp
.
get_context
(
"spawn"
)
# The interface in v1 engine has changed, run in v1 engine will hang.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
# Run in separate processes for memory & CUDA isolation
# Run in separate processes for memory & CUDA isolation
with
TemporaryDirectory
()
as
output_dir
:
with
TemporaryDirectory
()
as
output_dir
:
...
...
tests/tool_use/conftest.py
View file @
9c4ecf15
...
@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
...
@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
from
.utils
import
ARGS
,
CONFIGS
,
ServerConfig
from
.utils
import
ARGS
,
CONFIGS
,
ServerConfig
# select models to test based on command line arguments
def
pytest_addoption
(
parser
):
parser
.
addoption
(
"--models"
,
nargs
=
"+"
,
help
=
"Specify one or more models to test"
)
parser
.
addoption
(
"--extended"
,
action
=
"store_true"
,
default
=
False
,
help
=
"invoke extended tests requiring large GPUs"
)
# for each server config, download the model and return the config
# for each server config, download the model and return the config
@
pytest
.
fixture
(
scope
=
"session"
,
params
=
CONFIGS
.
keys
())
@
pytest
.
fixture
(
scope
=
"session"
,
params
=
CONFIGS
.
keys
())
def
server_config
(
request
):
def
server_config
(
request
):
config
=
CONFIGS
[
request
.
param
]
extended
=
request
.
config
.
getoption
(
"--extended"
)
models
=
request
.
config
.
getoption
(
"--models"
)
config_keys_to_test
=
[
key
for
key
in
CONFIGS
if
(
models
is
None
or
key
in
models
)
and
(
extended
or
not
CONFIGS
[
key
].
get
(
"extended"
,
False
))
]
config_key
=
request
.
param
if
config_key
not
in
config_keys_to_test
:
pytest
.
skip
(
f
"Skipping config '
{
config_key
}
'"
)
config
=
CONFIGS
[
config_key
]
if
current_platform
.
is_rocm
()
and
not
config
.
get
(
"supports_rocm"
,
True
):
if
current_platform
.
is_rocm
()
and
not
config
.
get
(
"supports_rocm"
,
True
):
pytest
.
skip
(
"The {} model can't be tested on the ROCm platform"
.
format
(
pytest
.
skip
(
"The {} model can't be tested on the ROCm platform"
.
format
(
...
...
tests/tool_use/utils.py
View file @
9c4ecf15
...
@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False):
...
@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False):
system_prompt
:
Optional
[
str
]
system_prompt
:
Optional
[
str
]
supports_parallel
:
Optional
[
bool
]
supports_parallel
:
Optional
[
bool
]
supports_rocm
:
Optional
[
bool
]
supports_rocm
:
Optional
[
bool
]
extended
:
Optional
[
bool
]
# tests do not run in CI automatically
def
patch_system_prompt
(
messages
:
list
[
dict
[
str
,
Any
]],
def
patch_system_prompt
(
messages
:
list
[
dict
[
str
,
Any
]],
...
@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel"
:
"supports_parallel"
:
False
,
False
,
},
},
"llama4"
:
{
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
str
(
VLLM_PATH
/
"examples/tool_chat_template_llama4_pythonic.jinja"
),
"-tp"
,
"4"
],
"supports_parallel"
:
False
,
"extended"
:
True
},
"mistral"
:
{
"mistral"
:
{
"model"
:
"model"
:
"mistralai/Mistral-7B-Instruct-v0.3"
,
"mistralai/Mistral-7B-Instruct-v0.3"
,
...
...
tests/tpu/test_compilation.py
View file @
9c4ecf15
...
@@ -44,7 +44,7 @@ def test_tpu_compilation():
...
@@ -44,7 +44,7 @@ def test_tpu_compilation():
assert
generated_text
.
startswith
(
answer
)
assert
generated_text
.
startswith
(
answer
)
compiled_codes
=
sorted
(
compiled_codes
=
sorted
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*.py"
)))
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__transformed_code*
for_forward
.py"
)))
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
...
@@ -52,15 +52,21 @@ def test_tpu_compilation():
...
@@ -52,15 +52,21 @@ def test_tpu_compilation():
# We should only trigger Dynamo compilation 2 times:
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# 2. Forward pass with kv_caches
# Check we have
4
compiled codes
# Check we have
2
compiled codes
assert
len
(
compiled_codes
)
==
2
assert
len
(
compiled_codes
)
==
2
kv_cache_prefix
=
"kv_cache"
kv_cache_prefix
=
"kv_cache"
attn_prefix
=
"ragged_paged_attention"
attn_prefix
=
"ragged_paged_attention"
def
extract_compiled_index
(
s
):
parts
=
s
.
replace
(
"."
,
"_"
).
split
(
"_"
)
numbers
=
[
int
(
part
)
for
part
in
parts
if
part
.
isdigit
()]
return
numbers
[
0
]
# Check all the compilations are as expected
# Check all the compilations are as expected
compiled_fns
=
sorted
(
compiled_fns
=
sorted
(
glob
.
glob
(
glob
.
glob
(
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)))
os
.
path
.
join
(
temp_dir
,
"__compiled_fn*Captured*.py"
)),
key
=
lambda
s
:
extract_compiled_index
(
s
))
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
9c4ecf15
...
@@ -3,14 +3,17 @@
...
@@ -3,14 +3,17 @@
import
pytest
import
pytest
import
torch
import
torch
from
vllm.multimodal.inputs
import
MultiModalKwargs
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
PlaceholderRange
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
sha256
from
vllm.utils
import
GiB_bytes
,
sha256
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
# yapf: disable
from
vllm.v1.core.kv_cache_utils
import
(
NONE_HASH
,
BlockHashType
,
from
vllm.v1.core.kv_cache_utils
import
(
NONE_HASH
,
BlockHashType
,
FreeKVCacheBlockQueue
,
KVCacheBlock
,
FreeKVCacheBlockQueue
,
KVCacheBlock
,
PrefixCachingMetrics
,
PrefixCachingMetrics
,
estimate_max_model_len
,
generate_block_hash_extra_keys
,
generate_block_hash_extra_keys
,
hash_block_tokens
,
hash_block_tokens
,
hash_request_tokens
,
hash_request_tokens
,
...
@@ -46,6 +49,18 @@ def make_request(request_id,
...
@@ -46,6 +49,18 @@ def make_request(request_id,
)
)
def
new_kv_cache_spec
(
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
use_mla
=
False
):
return
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
dtype
=
dtype
,
use_mla
=
use_mla
)
def
test_none_hash
():
def
test_none_hash
():
assert
NONE_HASH
is
not
None
assert
NONE_HASH
is
not
None
assert
isinstance
(
NONE_HASH
,
int
)
assert
isinstance
(
NONE_HASH
,
int
)
...
@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
...
@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
request
=
make_request
(
request
=
make_request
(
request_id
=
0
,
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
20
)],
prompt_token_ids
=
[
_
for
_
in
range
(
20
)],
mm_positions
=
[{
mm_positions
=
[
"offset"
:
0
,
PlaceholderRange
(
offset
=
0
,
length
=
5
),
"length"
:
5
PlaceholderRange
(
offset
=
10
,
length
=
5
),
},
{
],
"offset"
:
10
,
"length"
:
5
}],
mm_hashes
=
[
"hash1"
,
"hash2"
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
)
...
@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
...
@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
request
=
make_request
(
request
=
make_request
(
request_id
=
0
,
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
mm_positions
=
[
"offset"
:
0
,
PlaceholderRange
(
offset
=
0
,
length
=
3
),
"length"
:
3
PlaceholderRange
(
offset
=
3
,
length
=
3
),
},
{
],
"offset"
:
3
,
"length"
:
3
}],
mm_hashes
=
[
"hash1"
,
"hash2"
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
)
...
@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
...
@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1
=
make_request
(
request1
=
make_request
(
request_id
=
0
,
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
mm_positions
=
[
"offset"
:
0
,
PlaceholderRange
(
offset
=
0
,
length
=
3
),
"length"
:
3
PlaceholderRange
(
offset
=
3
,
length
=
3
),
},
{
],
"offset"
:
3
,
"length"
:
3
}],
mm_hashes
=
[
"hash1"
,
"hash2"
],
mm_hashes
=
[
"hash1"
,
"hash2"
],
)
)
request2
=
make_request
(
request2
=
make_request
(
request_id
=
1
,
request_id
=
1
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
mm_positions
=
[{
mm_positions
=
[
"offset"
:
0
,
PlaceholderRange
(
offset
=
0
,
length
=
3
),
"length"
:
3
PlaceholderRange
(
offset
=
3
,
length
=
3
),
},
{
],
"offset"
:
3
,
"length"
:
3
}],
mm_hashes
=
[
"hash3"
,
"hash2"
],
mm_hashes
=
[
"hash3"
,
"hash2"
],
)
)
block_size
=
3
block_size
=
3
...
@@ -337,18 +340,6 @@ def test_metrics():
...
@@ -337,18 +340,6 @@ def test_metrics():
def
test_unify_kv_cache_configs
():
def
test_unify_kv_cache_configs
():
def
new_kv_cache_spec
(
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
use_mla
=
False
):
return
FullAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
dtype
=
dtype
,
use_mla
=
use_mla
)
same_kv_cache_config
=
[
same_kv_cache_config
=
[
KVCacheConfig
(
KVCacheConfig
(
num_blocks
=
10
,
num_blocks
=
10
,
...
@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
...
@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
]
]
with
pytest
.
raises
(
AssertionError
):
with
pytest
.
raises
(
AssertionError
):
unify_kv_cache_configs
(
diff_kv_cache_config
)
unify_kv_cache_configs
(
diff_kv_cache_config
)
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"max_model_len"
,
"want_estimated_max_len"
),
[
(
"Qwen/Qwen1.5-7B"
,
16385
,
16384
),
(
"Qwen/Qwen1.5-7B"
,
16383
,
16383
),
])
def
test_estimate_max_model_len
(
model_id
,
max_model_len
,
want_estimated_max_len
):
# Create a VllmConfig
model_config
=
ModelConfig
(
model_id
,
task
=
"generate"
,
tokenizer
=
model_id
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"float16"
,
max_model_len
=
max_model_len
,
)
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
=
32768
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
scheduler_config
=
scheduler_config
,
)
# Create KV cache specs
kv_cache_spec
=
{}
for
i
in
range
(
32
):
layer_name
=
f
"layer_
{
i
}
"
kv_cache_spec
[
layer_name
]
=
FullAttentionSpec
(
block_size
=
16
,
num_kv_heads
=
32
,
head_size
=
128
,
dtype
=
torch
.
float16
,
use_mla
=
False
,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len
=
estimate_max_model_len
(
vllm_config
,
kv_cache_spec
,
8
*
GiB_bytes
)
assert
estimated_max_len
==
want_estimated_max_len
def
test_allocate_with_lookahead
():
"""Verify that lookahead tokens correctly affect block allocation"""
block_size
=
4
config
=
KVCacheConfig
(
num_blocks
=
10
,
tensors
=
{
"layer1"
:
KVCacheTensor
(
100
),
},
kv_cache_groups
=
[
KVCacheGroupSpec
([
"layer1"
],
new_kv_cache_spec
(
block_size
=
block_size
)),
],
)
request
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[],
mm_positions
=
None
,
mm_hashes
=
None
,
)
# Test case 1: Requires additional lookahead tokens
kv_cache_manager
=
KVCacheManager
(
kv_cache_config
=
config
,
max_model_len
=
100
,
num_preallocate_tokens
=
0
)
blocks
=
kv_cache_manager
.
allocate_slots
(
request
,
num_tokens
=
3
,
num_lookahead_tokens
=
2
,
# Total required: 3+2=5 tokens
)
assert
len
(
blocks
)
==
2
# ceil(5/4)=2 blocks
# Test case 2: With precomputed blocks
kv_cache_manager
=
KVCacheManager
(
kv_cache_config
=
config
,
max_model_len
=
100
,
num_preallocate_tokens
=
4
)
# num_preallocate_blocks = 4 // 4 - 2 // 4 = 1
# required_blocks = ceil((3 + 2) /4) = 2
# total_blocks = 1 + 2 = 3
blocks
=
kv_cache_manager
.
allocate_slots
(
request
,
num_tokens
=
3
,
num_lookahead_tokens
=
2
,
)
assert
len
(
blocks
)
==
3
# Test case 3: With precomputed blocks
# num_preallocate_blocks = 4 // 4 - 4 // 4 = 0
# required_blocks = ceil((3 + 4) / 4) = 2
# total_blocks = 0 + 2 = 2
kv_cache_manager
=
KVCacheManager
(
kv_cache_config
=
config
,
max_model_len
=
100
,
num_preallocate_tokens
=
4
)
blocks
=
kv_cache_manager
.
allocate_slots
(
request
,
num_tokens
=
3
,
num_lookahead_tokens
=
4
,
)
assert
len
(
blocks
)
==
2
tests/v1/core/test_scheduler.py
View file @
9c4ecf15
...
@@ -24,6 +24,7 @@ def create_scheduler(
...
@@ -24,6 +24,7 @@ def create_scheduler(
max_num_batched_tokens
:
int
=
8192
,
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
long_prefill_token_threshold
:
int
=
0
,
long_prefill_token_threshold
:
int
=
0
,
disable_chunked_mm_input
:
bool
=
False
,
)
->
Scheduler
:
)
->
Scheduler
:
'''Create scheduler under test.
'''Create scheduler under test.
...
@@ -43,6 +44,7 @@ def create_scheduler(
...
@@ -43,6 +44,7 @@ def create_scheduler(
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_model_len
=
max_num_batched_tokens
,
max_model_len
=
max_num_batched_tokens
,
long_prefill_token_threshold
=
long_prefill_token_threshold
,
long_prefill_token_threshold
=
long_prefill_token_threshold
,
disable_chunked_mm_input
=
disable_chunked_mm_input
,
)
)
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
model
,
model
=
model
,
...
@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
...
@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
assert
requests
[
2
].
request_id
not
in
output
.
num_scheduled_tokens
assert
requests
[
2
].
request_id
not
in
output
.
num_scheduled_tokens
def
test_no_mm_input_chunking
():
# Disable multimodal input chunking.
scheduler
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_num_batched_tokens
=
1024
,
disable_chunked_mm_input
=
True
,
)
mm_positions
=
[[
PlaceholderRange
(
offset
=
400
,
length
=
800
)]]
requests
=
create_requests
(
num_requests
=
1
,
num_tokens
=
1200
,
mm_positions
=
mm_positions
)
for
request
in
requests
:
scheduler
.
add_request
(
request
)
output
=
scheduler
.
schedule
()
assert
len
(
output
.
scheduled_new_reqs
)
==
1
assert
len
(
output
.
scheduled_cached_reqs
)
==
0
assert
len
(
output
.
finished_req_ids
)
==
0
# We want to only see the 400 text tokens at the start scheduled
assert
output
.
num_scheduled_tokens
[
requests
[
0
].
request_id
]
==
400
req_to_index
=
{
request
.
request_id
:
i
for
i
,
request
in
enumerate
(
requests
)
}
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[[]
for
_
in
range
(
len
(
requests
))],
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
)
scheduler
.
update_from_output
(
output
,
model_runner_output
)
output
=
scheduler
.
schedule
()
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
output
.
scheduled_new_reqs
)
==
0
assert
len
(
output
.
scheduled_cached_reqs
)
==
1
assert
len
(
output
.
finished_req_ids
)
==
0
assert
output
.
num_scheduled_tokens
[
requests
[
0
].
request_id
]
==
800
# Test that we fail if we disable chunked mm input and use too small
# of a max_num_batched_tokens for the mm input.
with
pytest
.
raises
(
ValueError
):
_
=
create_scheduler
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_num_batched_tokens
=
100
,
disable_chunked_mm_input
=
True
,
)
@
pytest
.
mark
.
parametrize
(
"enable_prefix_caching"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"enable_prefix_caching"
,
[
True
,
False
])
def
test_schedule_concurrent_partial_requests
(
enable_prefix_caching
:
bool
):
def
test_schedule_concurrent_partial_requests
(
enable_prefix_caching
:
bool
):
"""Test scheduling behavior with concurrent partial requests.
"""Test scheduling behavior with concurrent partial requests.
...
...
tests/v1/e2e/test_
ngram_
spec_decode.py
→
tests/v1/e2e/test_spec_decode.py
View file @
9c4ecf15
...
@@ -53,6 +53,11 @@ def model_name():
...
@@ -53,6 +53,11 @@ def model_name():
return
"meta-llama/Meta-Llama-3-8B-Instruct"
return
"meta-llama/Meta-Llama-3-8B-Instruct"
@
pytest
.
fixture
def
eagle_model_name
():
return
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
def
test_ngram_correctness
(
def
test_ngram_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
test_prompts
:
list
[
list
[
dict
[
str
,
Any
]]],
test_prompts
:
list
[
list
[
dict
[
str
,
Any
]]],
...
@@ -95,3 +100,47 @@ def test_ngram_correctness(
...
@@ -95,3 +100,47 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
# Upon failure, inspect the outputs to check for inaccuracy.
assert
matches
>
int
(
0.7
*
len
(
ref_outputs
))
assert
matches
>
int
(
0.7
*
len
(
ref_outputs
))
del
spec_llm
del
spec_llm
def
test_eagle_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
test_prompts
:
list
[
list
[
dict
[
str
,
Any
]]],
sampling_config
:
SamplingParams
,
model_name
:
str
,
eagle_model_name
:
str
,
):
'''
Compare the outputs of a original LLM and a speculative LLM
should be the same when using eagle speculative decoding.
'''
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
ref_llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
)
ref_outputs
=
ref_llm
.
chat
(
test_prompts
,
sampling_config
)
del
ref_llm
spec_llm
=
LLM
(
model
=
model_name
,
speculative_config
=
{
"method"
:
"eagle"
,
"model"
:
eagle_model_name
,
"num_speculative_tokens"
:
3
,
},
max_model_len
=
1024
,
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
matches
=
0
misses
=
0
for
ref_output
,
spec_output
in
zip
(
ref_outputs
,
spec_outputs
):
if
ref_output
.
outputs
[
0
].
text
==
spec_output
.
outputs
[
0
].
text
:
matches
+=
1
else
:
misses
+=
1
print
(
f
"ref_output:
{
ref_output
.
outputs
[
0
].
text
}
"
)
print
(
f
"spec_output:
{
spec_output
.
outputs
[
0
].
text
}
"
)
# Heuristic: expect at least 70% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert
matches
>
int
(
0.7
*
len
(
ref_outputs
))
del
spec_llm
tests/v1/engine/test_engine_args.py
View file @
9c4ecf15
...
@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
...
@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
# For H100 and H200, we use larger default values.
# For H100 and H200, we use larger default values.
default_llm_tokens
=
16384
default_llm_tokens
=
16384
default_server_tokens
=
8192
default_server_tokens
=
8192
default_max_num_seqs
=
1024
else
:
else
:
default_llm_tokens
=
8192
default_llm_tokens
=
8192
default_server_tokens
=
2048
default_server_tokens
=
2048
default_max_num_seqs
=
256
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
default_max_num_seqs
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_llm_tokens
# noqa: E501
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_llm_tokens
# noqa: E501
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
vllm_config
=
engine_args
.
create_engine_config
(
vllm_config
=
engine_args
.
create_engine_config
(
UsageContext
.
OPENAI_API_SERVER
)
UsageContext
.
OPENAI_API_SERVER
)
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
1024
assert
vllm_config
.
scheduler_config
.
max_num_seqs
==
default_max_num_seqs
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_server_tokens
# noqa: E501
assert
vllm_config
.
scheduler_config
.
max_num_batched_tokens
==
default_server_tokens
# noqa: E501
tests/v1/engine/test_engine_core_client.py
View file @
9c4ecf15
...
@@ -3,8 +3,10 @@
...
@@ -3,8 +3,10 @@
import
asyncio
import
asyncio
import
time
import
time
import
uuid
import
uuid
from
threading
import
Thread
from
typing
import
Optional
from
typing
import
Optional
import
psutil
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
...
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
@
pytest
.
mark
.
timeout
(
10
)
def
test_startup_failure
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
,
pytest
.
raises
(
Exception
)
as
e_info
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
engine_args
=
EngineArgs
(
model
=
MODEL_NAME
)
vllm_config
=
engine_args
.
create_engine_config
(
usage_context
=
UsageContext
.
UNKNOWN_CONTEXT
)
executor_class
=
Executor
.
get_class
(
vllm_config
)
# Start another thread to wait for engine core process to start
# and kill it - simulate fatal uncaught process exit.
this_proc
=
psutil
.
Process
()
children_before
=
set
(
this_proc
.
children
())
def
kill_first_child
():
while
True
:
time
.
sleep
(
0.5
)
children
=
set
(
this_proc
.
children
())
-
children_before
if
children
:
child
=
children
.
pop
()
print
(
"Killing child core process"
,
child
.
pid
)
child
.
kill
()
break
Thread
(
target
=
kill_first_child
,
daemon
=
True
).
start
()
_core_client
=
EngineCoreClient
.
make_client
(
multiprocess_mode
=
True
,
asyncio_mode
=
True
,
vllm_config
=
vllm_config
,
executor_class
=
executor_class
,
log_stats
=
True
,
)
assert
"Engine core initialization failed"
in
str
(
e_info
.
value
)
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
9c4ecf15
...
@@ -325,6 +325,45 @@ def test_structured_output(
...
@@ -325,6 +325,45 @@ def test_structured_output(
output_json
=
json
.
loads
(
generated_text
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
#
# Test 10: Generate structured with minLength and maxLength
#
min_length
=
50
max_length
=
50
json_schema
=
{
"type"
:
"object"
,
"properties"
:
{
"description"
:
{
"type"
:
"string"
,
"maxLength"
:
max_length
,
"minLength"
:
min_length
}
},
"required"
:
[
"description"
]
}
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
json_schema
))
outputs
=
llm
.
generate
(
prompts
=
"Generate a description of a frog using 50 characters."
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
@
pytest
.
mark
.
parametrize
(
"model_name, tokenizer_mode"
,
...
...
tests/v1/structured_output/test_utils.py
View file @
9c4ecf15
...
@@ -13,14 +13,6 @@ def unsupported_string_schemas():
...
@@ -13,14 +13,6 @@ def unsupported_string_schemas():
"type"
:
"string"
,
"type"
:
"string"
,
"pattern"
:
"^[a-zA-Z]+$"
"pattern"
:
"^[a-zA-Z]+$"
},
},
{
"type"
:
"string"
,
"minLength"
:
1
},
{
"type"
:
"string"
,
"maxLength"
:
100
},
{
{
"type"
:
"string"
,
"type"
:
"string"
,
"format"
:
"email"
"format"
:
"email"
...
@@ -164,6 +156,14 @@ def supported_schema():
...
@@ -164,6 +156,14 @@ def supported_schema():
"type"
:
"string"
,
"type"
:
"string"
,
"enum"
:
[
"sedan"
,
"suv"
,
"truck"
]
"enum"
:
[
"sedan"
,
"suv"
,
"truck"
]
},
},
"short_description"
:
{
"type"
:
"string"
,
"maxLength"
:
50
},
"long_description"
:
{
"type"
:
"string"
,
"minLength"
:
50
},
"address"
:
{
"address"
:
{
"type"
:
"object"
,
"type"
:
"object"
,
"properties"
:
{
"properties"
:
{
...
...
tests/v1/test_serial_utils.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
from
collections
import
UserDict
from
dataclasses
import
dataclass
import
numpy
as
np
import
torch
from
vllm.v1.serial_utils
import
MsgpackDecoder
,
MsgpackEncoder
class
UnrecognizedType
(
UserDict
):
def
__init__
(
self
,
an_int
:
int
):
super
().
__init__
()
self
.
an_int
=
an_int
@
dataclass
class
MyType
:
tensor1
:
torch
.
Tensor
a_string
:
str
list_of_tensors
:
list
[
torch
.
Tensor
]
numpy_array
:
np
.
ndarray
unrecognized
:
UnrecognizedType
small_f_contig_tensor
:
torch
.
Tensor
large_f_contig_tensor
:
torch
.
Tensor
small_non_contig_tensor
:
torch
.
Tensor
large_non_contig_tensor
:
torch
.
Tensor
def
test_encode_decode
():
"""Test encode/decode loop with zero-copy tensors."""
obj
=
MyType
(
tensor1
=
torch
.
randint
(
low
=
0
,
high
=
100
,
size
=
(
1024
,
),
dtype
=
torch
.
int32
),
a_string
=
"hello"
,
list_of_tensors
=
[
torch
.
rand
((
1
,
10
),
dtype
=
torch
.
float32
),
torch
.
rand
((
3
,
5
,
4000
),
dtype
=
torch
.
float64
),
torch
.
tensor
(
1984
),
# test scalar too
],
numpy_array
=
np
.
arange
(
512
),
unrecognized
=
UnrecognizedType
(
33
),
small_f_contig_tensor
=
torch
.
rand
(
5
,
4
).
t
(),
large_f_contig_tensor
=
torch
.
rand
(
1024
,
4
).
t
(),
small_non_contig_tensor
=
torch
.
rand
(
2
,
4
)[:,
1
:
3
],
large_non_contig_tensor
=
torch
.
rand
(
1024
,
512
)[:,
10
:
20
],
)
encoder
=
MsgpackEncoder
()
decoder
=
MsgpackDecoder
(
MyType
)
encoded
=
encoder
.
encode
(
obj
)
# There should be the main buffer + 4 large tensor buffers
# + 1 large numpy array. "large" is <= 512 bytes.
# The two small tensors are encoded inline.
assert
len
(
encoded
)
==
6
decoded
:
MyType
=
decoder
.
decode
(
encoded
)
assert_equal
(
decoded
,
obj
)
# Test encode_into case
preallocated
=
bytearray
()
encoded2
=
encoder
.
encode_into
(
obj
,
preallocated
)
assert
len
(
encoded2
)
==
6
assert
encoded2
[
0
]
is
preallocated
decoded2
:
MyType
=
decoder
.
decode
(
encoded2
)
assert_equal
(
decoded2
,
obj
)
def
assert_equal
(
obj1
:
MyType
,
obj2
:
MyType
):
assert
torch
.
equal
(
obj1
.
tensor1
,
obj2
.
tensor1
)
assert
obj1
.
a_string
==
obj2
.
a_string
assert
all
(
torch
.
equal
(
a
,
b
)
for
a
,
b
in
zip
(
obj1
.
list_of_tensors
,
obj2
.
list_of_tensors
))
assert
np
.
array_equal
(
obj1
.
numpy_array
,
obj2
.
numpy_array
)
assert
obj1
.
unrecognized
.
an_int
==
obj2
.
unrecognized
.
an_int
assert
torch
.
equal
(
obj1
.
small_f_contig_tensor
,
obj2
.
small_f_contig_tensor
)
assert
torch
.
equal
(
obj1
.
large_f_contig_tensor
,
obj2
.
large_f_contig_tensor
)
assert
torch
.
equal
(
obj1
.
small_non_contig_tensor
,
obj2
.
small_non_contig_tensor
)
assert
torch
.
equal
(
obj1
.
large_non_contig_tensor
,
obj2
.
large_non_contig_tensor
)
tests/v1/tpu/test_pallas.py
View file @
9c4ecf15
...
@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
...
@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
import
torch
import
torch
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.v1.attention.backends.pallas
import
(
NUM_KV_PAGES_PER_BLOCK
,
from
vllm.v1.attention.backends.pallas
import
(
PallasAttentionBackendImpl
,
NUM_QUERIES_PER_BLOCK
,
PallasAttentionBackendImpl
,
PallasMetadata
)
PallasMetadata
)
...
@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
...
@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
logits_soft_cap
=
logits_soft_cap
,
logits_soft_cap
=
logits_soft_cap
,
attn_type
=
AttentionType
.
DECODER
,
attn_type
=
AttentionType
.
DECODER
,
)
)
mock_vmem_limit_bytes
=
1024
attn_impl
.
vmem_limit_bytes
=
mock_vmem_limit_bytes
class
FakeAttentionLayer
:
class
FakeAttentionLayer
:
_k_scale_float
:
float
_k_scale_float
:
float
...
@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
...
@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
ANY
,
# block_tables
ANY
,
# block_tables
ANY
,
# query_start_loc
ANY
,
# query_start_loc
ANY
,
# num_seqs
ANY
,
# num_seqs
num_kv_pages_per_block
=
N
UM_KV_PAGES_PER_BLOCK
,
num_kv_pages_per_block
=
N
one
,
num_queries_per_block
=
N
UM_QUERIES_PER_BLOCK
,
num_queries_per_block
=
N
one
,
vmem_limit_bytes
=
mock_vmem_limit_bytes
,
vmem_limit_bytes
=
None
,
use_kernel
=
True
,
use_kernel
=
True
,
sm_scale
=
scale
,
sm_scale
=
scale
,
sliding_window
=
sliding_window
,
sliding_window
=
sliding_window
,
...
...
tests/v1/tpu/test_sampler.py
View file @
9c4ecf15
...
@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
...
@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
sampling_params
=
SamplingParams
(
temperature
=
0.1
,
min_p
=
0.8
,
max_tokens
=
64
)
sampling_params
=
SamplingParams
(
temperature
=
0.1
,
min_p
=
0.8
,
max_tokens
=
64
)
output2
=
llm
.
generate
(
prompts
,
sampling_params
)
output2
=
llm
.
generate
(
prompts
,
sampling_params
)
assert
output
[
0
].
outputs
[
0
].
text
!=
output2
[
0
].
outputs
[
0
].
text
assert
output
[
0
].
outputs
[
0
].
text
!=
output2
[
0
].
outputs
[
0
].
text
with
pytest
.
raises
(
ValueError
):
# Unsupported `seed` param.
sampling_params
=
SamplingParams
(
temperature
=
0.3
,
seed
=
42
)
output2
=
llm
.
generate
(
prompts
,
sampling_params
)
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
9c4ecf15
...
@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
...
@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
SchedulerOutput
)
SchedulerOutput
)
from
vllm.v1.worker.tpu_model_runner
import
(
TPUModelRunner
,
from
vllm.v1.worker.tpu_model_runner
import
(
_get_padded_token_len
,
TPUModelRunner
,
_get_padded_num_reqs_with_upper_limit
,
_get
_paddings
)
_get_padded_token_len
,
_get_req_paddings
,
_get_token
_paddings
)
# Mock torch_xla module since it may not be available in the test environments
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher
=
mock
.
patch
.
dict
(
torch_xla_patcher
=
mock
.
patch
.
dict
(
...
@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
...
@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
def
test_get_paddings
():
def
test_get_paddings
():
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
expected_paddings
=
[
16
,
32
,
64
,
128
,
192
,
256
,
320
,
384
,
448
,
512
]
expected_paddings
=
[
16
,
32
,
64
,
128
,
192
,
256
,
320
,
384
,
448
,
512
]
actual_paddings
=
_get_paddings
(
min_token_size
,
max_token_size
,
actual_paddings
=
_get_
token_
paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
padding_gap
)
assert
actual_paddings
==
expected_paddings
assert
actual_paddings
==
expected_paddings
def
test_get_padded_token_len
():
def
test_get_padded_token_len
():
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
min_token_size
,
max_token_size
,
padding_gap
=
16
,
512
,
64
paddings
=
_get_paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
paddings
=
_get_
token_
paddings
(
min_token_size
,
max_token_size
,
padding_gap
)
assert
_get_padded_token_len
(
paddings
,
1
)
==
16
assert
_get_padded_token_len
(
paddings
,
1
)
==
16
assert
_get_padded_token_len
(
paddings
,
16
)
==
16
assert
_get_padded_token_len
(
paddings
,
16
)
==
16
assert
_get_padded_token_len
(
paddings
,
20
)
==
32
assert
_get_padded_token_len
(
paddings
,
20
)
==
32
assert
_get_padded_token_len
(
paddings
,
300
)
==
320
assert
_get_padded_token_len
(
paddings
,
300
)
==
320
assert
_get_padded_token_len
(
paddings
,
512
)
==
512
assert
_get_padded_token_len
(
paddings
,
512
)
==
512
def test_get_padded_num_reqs_with_upper_limit():
    """Check ``_get_padded_num_reqs_with_upper_limit`` on fixed inputs."""
    # Each row: (first argument, second argument, expected result).
    # NOTE(review): going by the helper's name the arguments are presumably
    # (num_reqs, upper_limit) -- confirm against its definition.
    cases = (
        (3, 32, 8),
        (9, 32, 16),
        (19, 32, 32),
        (17, 28, 28),
    )
    for count, limit, expected in cases:
        assert _get_padded_num_reqs_with_upper_limit(count, limit) == expected
def test_get_req_paddings():
    """Check ``_get_req_paddings`` on fixed inputs."""
    # Each row: (first argument, second argument, expected padding list).
    # NOTE(review): the arguments look like a (min, max) request-count pair
    # -- confirm against the helper's definition.
    cases = (
        (1, 32, [8, 16, 32]),
        (8, 32, [8, 16, 32]),
        (8, 36, [8, 16, 32, 36]),
    )
    for lower, upper, expected in cases:
        assert _get_req_paddings(lower, upper) == expected
tools/update-dockerfile-graph.sh
0 → 100755
View file @
9c4ecf15
#!/bin/bash
# Update Dockerfile dependency graph when docker/Dockerfile changes.
# This script is designed to be used as a pre-commit hook.
#
# Exit status:
#   0 - docker/Dockerfile is not staged, Docker is unavailable, or the
#       regenerated graph is identical to the existing one.
#   1 - the graph image changed (it must be staged), or the generated PNG
#       could not be located.

set -euo pipefail

# Check if docker/Dockerfile is staged for commit.
# The file list is captured first rather than piped into `grep -q`: with
# `pipefail`, grep exiting on its first match can kill the producer with
# SIGPIPE and make the whole pipeline report failure despite the match.
STAGED_FILES=$(git diff --cached --name-only || true)
if grep -qxF "docker/Dockerfile" <<< "$STAGED_FILES"; then
  echo "docker/Dockerfile has changed, attempting to update dependency graph..."

  # Check if Docker is installed and running
  if ! command -v docker &> /dev/null; then
    echo "Warning: Docker command not found. Skipping Dockerfile graph update."
    echo "Please install Docker to automatically update the graph: https://docs.docker.com/get-docker/"
    exit 0
  fi
  if ! docker info &> /dev/null; then
    echo "Warning: Docker daemon is not running. Skipping Dockerfile graph update."
    echo "Please start Docker to automatically update the graph."
    exit 0
  fi

  # Define the target file path
  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"

  # Ensure target directory exists
  mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"

  # Store old image hash in a variable if the file exists
  OLD_HASH=""
  if [ -f "$TARGET_GRAPH_FILE" ]; then
    OLD_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  fi

  # Generate Dockerfile graph
  echo "Running dockerfilegraph tool..."
  docker run \
    --rm \
    --user "$(id -u):$(id -g)" \
    --workdir /workspace \
    --volume "$(pwd)":/workspace \
    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
    --output png \
    --dpi 200 \
    --max-label-length 50 \
    --filename docker/Dockerfile \
    --legend

  echo "Finding generated PNG file..."
  # Check for Dockerfile.png in the root directory (most likely location)
  if [ -f "./Dockerfile.png" ]; then
    echo "Found generated file at: ./Dockerfile.png"
    mv "./Dockerfile.png" "$TARGET_GRAPH_FILE"
  else
    # Try to find it elsewhere. `-print -quit` stops at the first hit without
    # a `| head -1` pipe, which under `pipefail` + `set -e` could abort the
    # script when find is killed by SIGPIPE.
    DOCKERFILE_PNG=$(find . -name "Dockerfile.png" -type f -print -quit)
    if [ -n "$DOCKERFILE_PNG" ]; then
      echo "Found generated file at: $DOCKERFILE_PNG"
      mv "$DOCKERFILE_PNG" "$TARGET_GRAPH_FILE"
    else
      echo "Error: Could not find the generated PNG file"
      # List recently modified PNGs to help diagnose where the tool wrote.
      find . -name "*.png" -type f -mmin -5
      exit 1
    fi
  fi

  # Check if the graph has changed
  NEW_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
  if [ "$NEW_HASH" != "$OLD_HASH" ]; then
    echo "Graph has changed. Please stage the updated file: $TARGET_GRAPH_FILE"
    exit 1
  else
    echo "No changes in graph detected."
  fi
fi

exit 0
vllm/_custom_ops.py
View file @
9c4ecf15
...
@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu(
...
@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu(
block_tables
,
seq_lens
)
block_tables
,
seq_lens
)
# merge attn states ops
def merge_attn_states(
    output: torch.Tensor,
    prefix_output: torch.Tensor,
    prefix_lse: torch.Tensor,
    suffix_output: torch.Tensor,
    suffix_lse: torch.Tensor,
    output_lse: Optional[torch.Tensor] = None,
) -> None:
    """Forward to the ``torch.ops._C.merge_attn_states`` custom op.

    Returns ``None``; nothing is returned from the underlying op call.
    NOTE(review): presumably the op writes its result into ``output`` (and
    ``output_lse`` when given), combining the prefix and suffix attention
    outputs by their log-sum-exp values -- confirm against the kernel.

    Args:
        output: Passed as the op's first argument.
        prefix_output: Passed as the op's third argument.
        prefix_lse: Passed as the op's fourth argument.
        suffix_output: Passed as the op's fifth argument.
        suffix_lse: Passed as the op's sixth argument.
        output_lse: Optional tensor, passed as the op's second argument.
    """
    # The op takes output_lse second, unlike this wrapper's parameter order.
    torch.ops._C.merge_attn_states(
        output,
        output_lse,
        prefix_output,
        prefix_lse,
        suffix_output,
        suffix_lse,
    )
# pos encoding ops
# pos encoding ops
def
rotary_embedding
(
def
rotary_embedding
(
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
...
...
vllm/attention/backends/flash_attn.py
View file @
9c4ecf15
...
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
...
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert
self
.
use_cuda_graph
assert
self
.
use_cuda_graph
if
turn_prefills_into_decodes
:
if
turn_prefills_into_decodes
:
# When Mu
t
li-Step is enabled with Chunked-Prefill, prefills and
# When Mul
t
i-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# prefills turn into decodes. This update reflects that
# conversion.
# conversion.
...
@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl):
...
@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
if
blocksparse_params
is
not
None
:
if
blocksparse_params
is
not
None
:
raise
ValueError
(
raise
ValueError
(
"FlashAttention does not support block-sparse attention."
)
"FlashAttention does not support block-sparse attention."
)
if
use_irope
:
logger
.
warning
(
"Using irope in V0 is not supported yet, it will fall back "
"to global attention for long context."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
vllm/attention/backends/flashinfer.py
View file @
9c4ecf15
...
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
...
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.attention.ops.paged_attn
import
PagedAttention
from
vllm.attention.ops.paged_attn
import
PagedAttention
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
from
vllm.config
import
VllmConfig
,
get_current_vllm_config
from
vllm.logger
import
init_logger
from
vllm.utils
import
(
async_tensor_h2d
,
get_kv_cache_torch_dtype
,
from
vllm.utils
import
(
async_tensor_h2d
,
get_kv_cache_torch_dtype
,
make_tensor_with_pad
)
make_tensor_with_pad
)
logger
=
init_logger
(
__name__
)
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.worker.model_runner
import
(
ModelInputForGPUBuilder
,
from
vllm.worker.model_runner
import
(
ModelInputForGPUBuilder
,
ModelInputForGPUWithSamplingMetadata
)
ModelInputForGPUWithSamplingMetadata
)
...
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
...
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
str
=
AttentionType
.
DECODER
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
if
use_irope
:
logger
.
warning_once
(
"Using irope in FlashInfer is not supported yet, it will fall"
" back to global attention for long context."
)
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
scale
=
float
(
scale
)
...
...
vllm/attention/backends/hpu_attn.py
View file @
9c4ecf15
...
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
...
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
blocksparse_params
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
max_seq_len
:
int
=
4096
,
max_seq_len
:
int
=
4096
,
attn_type
:
str
=
AttentionType
.
DECODER
,
attn_type
:
str
=
AttentionType
.
DECODER
,
use_irope
:
bool
=
False
,
)
->
None
:
)
->
None
:
super
(
AttentionImpl
,
self
).
__init__
()
super
(
AttentionImpl
,
self
).
__init__
()
if
use_irope
:
logger
.
warning_once
(
"Using irope in HPU is not supported yet, it will fall back "
"to global attention for long context."
)
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
num_heads
=
num_heads
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
head_size
=
head_size
...
@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
...
@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
self
.
fused_scaled_dot_product_attention
=
ModuleFusedSDPA
(
self
.
fused_scaled_dot_product_attention
=
ModuleFusedSDPA
(
FusedSDPA
)
FusedSDPA
)
except
ImportError
:
except
ImportError
:
logger
()
.
warning
(
"Could not import HPU FusedSDPA kernel. "
logger
.
warning
(
"Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation."
)
"vLLM will use native implementation."
)
suppored_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
suppor
t
ed_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppored_head_sizes
:
if
head_size
not
in
suppor
t
ed_head_sizes
:
raise
ValueError
(
raise
ValueError
(
f
"Head size
{
head_size
}
is not supported by PagedAttention. "
f
"Head size
{
head_size
}
is not supported by PagedAttention. "
f
"Supported head sizes are:
{
suppored_head_sizes
}
."
)
f
"Supported head sizes are:
{
suppor
t
ed_head_sizes
}
."
)
if
attn_type
!=
AttentionType
.
DECODER
:
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment