Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
73cfb7a7
Unverified
Commit
73cfb7a7
authored
Dec 23, 2025
by
Weida Hong
Committed by
GitHub
Dec 23, 2025
Browse files
Correct position of docstring of class attributes (#31209)
Signed-off-by:
Weida Hong
<
wdhongtw@google.com
>
parent
f32cfd7d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
10 deletions
+13
-10
vllm/forward_context.py
vllm/forward_context.py
+1
-1
vllm/v1/kv_cache_interface.py
vllm/v1/kv_cache_interface.py
+12
-9
No files found.
vllm/forward_context.py
View file @
73cfb7a7
...
...
@@ -186,6 +186,7 @@ class DPMetadata:
class
ForwardContext
:
# copy from vllm_config.compilation_config.static_forward_context
no_compile_layers
:
dict
[
str
,
Any
]
attn_metadata
:
dict
[
str
,
AttentionMetadata
]
|
list
[
dict
[
str
,
AttentionMetadata
]]
"""
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
attention layer to its attention metadata
...
...
@@ -193,7 +194,6 @@ class ForwardContext:
for each microbatch.
Set dynamically for each forward pass
"""
attn_metadata
:
dict
[
str
,
AttentionMetadata
]
|
list
[
dict
[
str
,
AttentionMetadata
]]
# TODO: remove after making all virtual_engines share the same kv cache
virtual_engine
:
int
# set dynamically for each forward pass
# set dynamically for each forward pass
...
...
vllm/v1/kv_cache_interface.py
View file @
73cfb7a7
...
...
@@ -80,8 +80,6 @@ class AttentionSpec(KVCacheSpec):
@
dataclass
(
frozen
=
True
)
class
FullAttentionSpec
(
AttentionSpec
):
sliding_window
:
int
|
None
=
None
attention_chunk_size
:
int
|
None
=
None
"""
When hybrid allocator is disabled and the model contains both full
attention layers and sliding window attention layers, sliding
...
...
@@ -89,8 +87,13 @@ class FullAttentionSpec(AttentionSpec):
(blocks are allocated for all tokens), while computed as sliding window
attention in model runner.
In this case, we use FullAttentionSpec and record the sliding window size.
"""
sliding_window
:
int
|
None
=
None
"""
Default to None for not using sliding window attention.
"""
attention_chunk_size
:
int
|
None
=
None
def
max_memory_usage_bytes
(
self
,
vllm_config
:
VllmConfig
)
->
int
:
max_model_len
=
vllm_config
.
model_config
.
max_model_len
...
...
@@ -390,10 +393,11 @@ class KVCacheConfig:
The KV cache configuration of a model.
"""
"""The number of KV cache blocks"""
num_blocks
:
int
"""
How should model runner initialize the KV cache tensors for each layer
"""
"""
The number of KV cache blocks
"""
kv_cache_tensors
:
list
[
KVCacheTensor
]
"""How should model runner initialize the KV cache tensors for each layer"""
kv_cache_groups
:
list
[
KVCacheGroupSpec
]
"""
The kv cache groups of the model.
For models with only one type of attention, there is only one group that
...
...
@@ -401,4 +405,3 @@ class KVCacheConfig:
For models with multiple types of attention, there will be multiple groups,
see `_get_kv_cache_config_uniform_page_size` for more details.
"""
kv_cache_groups
:
list
[
KVCacheGroupSpec
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment