Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8ee90c83
Unverified
Commit
8ee90c83
authored
Dec 24, 2025
by
Michael Goin
Committed by
GitHub
Dec 23, 2025
Browse files
Add `--max-model-len auto` to auto-fit context to available memory (#29431)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
d7e05ac7
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
315 additions
and
33 deletions
+315
-33
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+10
-0
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+57
-0
vllm/config/model.py
vllm/config/model.py
+8
-4
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+7
-0
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+209
-29
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+11
-0
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+13
-0
No files found.
tests/engine/test_arg_utils.py
View file @
8ee90c83
...
@@ -511,6 +511,16 @@ def test_human_readable_model_len():
...
@@ -511,6 +511,16 @@ def test_human_readable_model_len():
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567t"
])
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567t"
])
assert
args
.
max_model_len
==
10212345123456
assert
args
.
max_model_len
==
10212345123456
# Special value -1 for auto-fit to GPU memory
args
=
parser
.
parse_args
([
"--max-model-len"
,
"-1"
])
assert
args
.
max_model_len
==
-
1
# 'auto' is an alias for -1
args
=
parser
.
parse_args
([
"--max-model-len"
,
"auto"
])
assert
args
.
max_model_len
==
-
1
args
=
parser
.
parse_args
([
"--max-model-len"
,
"AUTO"
])
assert
args
.
max_model_len
==
-
1
# Invalid (do not allow decimals with binary multipliers)
# Invalid (do not allow decimals with binary multipliers)
for
invalid
in
[
"1a"
,
"pwd"
,
"10.24"
,
"1.23M"
,
"1.22T"
]:
for
invalid
in
[
"1a"
,
"pwd"
,
"10.24"
,
"1.23M"
,
"1.22T"
]:
with
pytest
.
raises
(
ArgumentError
):
with
pytest
.
raises
(
ArgumentError
):
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
8ee90c83
...
@@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
...
@@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
)
)
)
)
assert
block_hashes
[
1
]
==
expected_hash2
assert
block_hashes
[
1
]
==
expected_hash2
def test_auto_fit_max_model_len():
    """Check that auto-fit (original_max_model_len == -1) adapts max_model_len.

    Two scenarios are exercised against ``get_kv_cache_configs``:

    * ample KV cache memory: ``max_model_len`` stays at the configured 1024;
    * constrained KV cache memory: ``max_model_len`` ends up strictly between
      0 and 1024.
    """

    def make_auto_fit_config():
        # A fresh config per scenario, because get_kv_cache_configs may
        # rewrite model_config.max_model_len in place.
        cfg = ModelConfig(max_model_len=1024)
        # Emulate the user passing -1 / "auto" on the command line.
        cfg.original_max_model_len = -1
        return VllmConfig(model_config=cfg)

    vllm_config = make_auto_fit_config()

    # 16 * 2 * 64 * 4 * 2 == 16384 bytes (16 KiB) per block per layer.
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # Scenario 1: room for 1024 blocks across both layers -- nothing to shrink.
    large_available_memory = mem_per_block_per_layer * 2 * 1024
    get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory])
    assert vllm_config.model_config.max_model_len == 1024

    # Scenario 2: start over with an untouched config.
    vllm_config = make_auto_fit_config()

    # Only 32 blocks across both layers; per the original author's note this
    # corresponds to 32 * 16 = 512 tokens (assumes a 16-token block size from
    # new_kv_cache_spec -- TODO confirm), i.e. fewer than the requested 1024.
    limited_memory = mem_per_block_per_layer * 2 * 32
    get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory])

    # The length must have been reduced, but not down to zero.
    assert vllm_config.model_config.max_model_len < 1024
    assert vllm_config.model_config.max_model_len > 0
def test_auto_fit_max_model_len_not_triggered():
    """Check that max_model_len is left alone when auto-fit was not requested.

    ``original_max_model_len`` is never set to -1 here, so after
    ``get_kv_cache_configs`` the configured ``max_model_len`` of 16 must be
    unchanged.
    """
    # original_max_model_len is deliberately left at its default (the original
    # test expects that default to be None rather than -1).
    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))

    # 16 KiB per block per layer, same sizing as the auto-fit test.
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        layer_name: new_kv_cache_spec() for layer_name in ("layer_1", "layer_2")
    }

    # 32 blocks for 2 layers: the regular (non auto-fit) path should succeed.
    available_memory = [mem_per_block_per_layer * 2 * 32]
    get_kv_cache_configs(vllm_config, [kv_cache_specs], available_memory)

    assert vllm_config.model_config.max_model_len == 16
vllm/config/model.py
View file @
8ee90c83
...
@@ -172,7 +172,10 @@ class ModelConfig:
...
@@ -172,7 +172,10 @@ class ModelConfig:
format. Examples:
\n
format. Examples:
\n
- 1k -> 1000
\n
- 1k -> 1000
\n
- 1K -> 1024
\n
- 1K -> 1024
\n
- 25.6k -> 25,600"""
- 25.6k -> 25,600
\n
- -1 or 'auto' -> Automatically choose the maximum model length that fits in
GPU memory. This will use the model's maximum context length if it fits,
otherwise it will find the largest length that can be accommodated."""
spec_target_max_model_len
:
int
|
None
=
None
spec_target_max_model_len
:
int
|
None
=
None
"""Specify the maximum length for spec decoding draft models."""
"""Specify the maximum length for spec decoding draft models."""
quantization
:
QuantizationMethods
|
str
|
None
=
None
quantization
:
QuantizationMethods
|
str
|
None
=
None
...
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
...
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
if
encoder_config
and
"max_seq_length"
in
encoder_config
:
if
encoder_config
and
"max_seq_length"
in
encoder_config
:
derived_max_model_len
=
encoder_config
[
"max_seq_length"
]
derived_max_model_len
=
encoder_config
[
"max_seq_length"
]
# If the user didn't specify `max_model_len`, then use that derived from
# If the user didn't specify `max_model_len` or specified -1 (auto-fit),
# the model config as a default value.
# then use that derived from the model config as a default value.
if
max_model_len
is
None
:
# When -1 is specified, the engine will later auto-fit to available memory.
if
max_model_len
is
None
or
max_model_len
==
-
1
:
# For LongRoPE, default to original_max_position_embeddings to avoid
# For LongRoPE, default to original_max_position_embeddings to avoid
# performance degradation for shorter sequences
# performance degradation for shorter sequences
if
rope_parameters
is
not
None
and
any
(
if
rope_parameters
is
not
None
and
any
(
...
...
vllm/engine/arg_utils.py
View file @
8ee90c83
...
@@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
...
@@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
def
human_readable_int
(
value
):
def
human_readable_int
(
value
):
"""Parse human-readable integers like '1k', '2M', etc.
"""Parse human-readable integers like '1k', '2M', etc.
Including decimal values with decimal multipliers.
Including decimal values with decimal multipliers.
Also accepts -1 or 'auto' as a special value for auto-detection.
Examples:
Examples:
- '1k' -> 1,000
- '1k' -> 1,000
- '1K' -> 1,024
- '1K' -> 1,024
- '25.6k' -> 25,600
- '25.6k' -> 25,600
- '-1' or 'auto' -> -1 (special value for auto-detection)
"""
"""
value
=
value
.
strip
()
value
=
value
.
strip
()
# Handle -1 or 'auto' as a special value for auto-detection
if
value
==
"-1"
or
value
.
lower
()
==
"auto"
:
return
-
1
match
=
re
.
fullmatch
(
r
"(\d+(?:\.\d+)?)([kKmMgGtT])"
,
value
)
match
=
re
.
fullmatch
(
r
"(\d+(?:\.\d+)?)([kKmMgGtT])"
,
value
)
if
match
:
if
match
:
decimal_multiplier
=
{
decimal_multiplier
=
{
...
...
vllm/v1/core/kv_cache_utils.py
View file @
8ee90c83
...
@@ -624,6 +624,9 @@ def estimate_max_model_len(
...
@@ -624,6 +624,9 @@ def estimate_max_model_len(
Estimates the maximum model length that can fit in the available memory
Estimates the maximum model length that can fit in the available memory
using binary search.
using binary search.
This function temporarily modifies max_model_len during estimation but
restores the original value before returning, ensuring no side effects.
Args:
Args:
vllm_config: The global VllmConfig
vllm_config: The global VllmConfig
kv_cache_spec: The kv cache spec of each attention layer in the model
kv_cache_spec: The kv cache spec of each attention layer in the model
...
@@ -632,18 +635,20 @@ def estimate_max_model_len(
...
@@ -632,18 +635,20 @@ def estimate_max_model_len(
Returns:
Returns:
The estimated maximum model length that can fit in the available memory.
The estimated maximum model length that can fit in the available memory.
"""
"""
# Save the original max_model_len to restore after estimation
original_max_model_len
=
vllm_config
.
model_config
.
max_model_len
# Define a function to check if a given model length fits in memory
# Define a function to check if a given model length fits in memory
def
fits_in_memory
(
model_len
:
int
)
->
bool
:
def
fits_in_memory
(
model_len
:
int
)
->
bool
:
#
M
odify the max_model_len for this calculation
#
Temporarily m
odify the max_model_len for this calculation
vllm_config
.
model_config
.
max_model_len
=
model_len
vllm_config
.
model_config
.
max_model_len
=
model_len
# Calculate memory needed for the given model length
# Calculate memory needed for the given model length
memory_needed
=
max_memory_usage_bytes
(
vllm_config
,
kv_cache_spec
.
values
())
memory_needed
=
max_memory_usage_bytes
(
vllm_config
,
kv_cache_spec
.
values
())
return
memory_needed
<=
available_memory
return
memory_needed
<=
available_memory
try
:
# Binary search for the maximum model length
# Binary search for the maximum model length
current_max
=
vllm_config
.
model_config
.
max_model_len
left
,
right
=
1
,
original_max_model_len
left
,
right
=
1
,
current_max
# If even the smallest model length doesn't fit, return 0
# If even the smallest model length doesn't fit, return 0
if
not
fits_in_memory
(
left
):
if
not
fits_in_memory
(
left
):
...
@@ -659,6 +664,9 @@ def estimate_max_model_len(
...
@@ -659,6 +664,9 @@ def estimate_max_model_len(
else
:
else
:
right
=
mid
-
1
right
=
mid
-
1
return
result
return
result
finally
:
# Always restore the original max_model_len to avoid side effects
vllm_config
.
model_config
.
max_model_len
=
original_max_model_len
def
check_enough_kv_cache_memory
(
def
check_enough_kv_cache_memory
(
...
@@ -1301,6 +1309,140 @@ def _report_kv_cache_config(
...
@@ -1301,6 +1309,140 @@ def _report_kv_cache_config(
)
)
def _max_memory_usage_bytes_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
) -> int:
    """
    Return the maximum KV cache memory, in bytes, implied by the given groups.

    Three cases are handled:

    * no groups: 0 bytes;
    * exactly one group whose spec is a ``UniformTypeKVCacheSpecs``: the sum
      of ``max_memory_usage_bytes`` over its per-layer specs;
    * otherwise: ``group_size * page_size * blocks_needed``, where
      ``group_size`` is the largest layer count of any group. Sizing by the
      largest group is what accounts for padding in hybrid models (the
      original author's example: 8 full attention + 9 sliding window layers
      are padded to 9 + 9).

    Args:
        vllm_config: The global VllmConfig, forwarded to each spec's
            ``max_memory_usage_bytes``.
        kv_cache_groups: The KV cache groups to size.

    Returns:
        The maximum memory usage in bytes.
    """
    if not kv_cache_groups:
        return 0

    first_spec = kv_cache_groups[0].kv_cache_spec

    # Single group carrying per-layer specs: add up every layer individually.
    is_single_uniform_group = len(kv_cache_groups) == 1 and isinstance(
        first_spec, UniformTypeKVCacheSpecs
    )
    if is_single_uniform_group:
        total_bytes = 0
        for layer_spec in first_spec.kv_cache_specs.values():
            total_bytes += layer_spec.max_memory_usage_bytes(vllm_config)
        return total_bytes

    # General case: as many memory pools as the largest group has layers,
    # each pool holding `blocks_needed` pages of the shared page size.
    group_size = max(len(group.layer_names) for group in kv_cache_groups)
    group_specs = [group.kv_cache_spec for group in kv_cache_groups]
    page_size = get_uniform_page_size(group_specs)

    # NOTE(review): the block count is derived from the first group's spec
    # only -- confirm that every group needs the same number of blocks.
    # `cdiv` is presumably ceiling division -- verify against its definition.
    blocks_needed = cdiv(first_spec.max_memory_usage_bytes(vllm_config), page_size)

    return group_size * page_size * blocks_needed
def _estimate_max_model_len_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: int,
) -> int:
    """
    Find the largest model length whose KV cache fits in `available_memory`.

    A binary search over ``[1, current max_model_len]`` is used. Each probe
    temporarily overwrites ``vllm_config.model_config.max_model_len``; the
    value that was set on entry is always restored before returning.

    Args:
        vllm_config: The global VllmConfig (mutated only temporarily).
        kv_cache_groups: The KV cache groups used to compute memory usage.
        available_memory: Memory budget for the KV cache, in bytes.

    Returns:
        The largest fitting model length, or 0 if even a length of 1 does
        not fit.
    """
    saved_max_model_len = vllm_config.model_config.max_model_len

    def fits(model_len: int) -> bool:
        # The memory calculation reads max_model_len from the config, so the
        # candidate has to be written there before measuring.
        vllm_config.model_config.max_model_len = model_len
        needed = _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups)
        return needed <= available_memory

    try:
        low, high = 1, saved_max_model_len

        # Not even the smallest length fits: signal that with 0.
        if not fits(low):
            return 0

        best = 1
        while low <= high:
            probe = (low + high) // 2
            if not fits(probe):
                # Too big: discard the upper half.
                high = probe - 1
                continue
            # Fits: remember it and try something larger.
            best = probe
            low = probe + 1
        return best
    finally:
        # Undo the temporary writes made by `fits`, whatever the outcome.
        vllm_config.model_config.max_model_len = saved_max_model_len
def _auto_fit_max_model_len(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: list[int],
) -> None:
    """
    Shrink ``max_model_len`` in place so the KV cache fits on every worker.

    The smallest per-worker memory budget is used for the estimate. If the
    currently configured ``max_model_len`` already fits, the config is left
    untouched; otherwise it is lowered to the largest length that fits.
    With no KV cache groups nothing is estimated and the config is kept.

    Args:
        vllm_config: The global VllmConfig (may be modified in-place).
        kv_cache_groups: The global KV cache groups.
        available_memory: Memory available for KV cache in bytes, one entry
            per worker.

    Raises:
        ValueError: If not even a single token fits in the smallest budget.
    """
    derived_max_len = vllm_config.model_config.max_model_len

    # No groups at all: the original code treats this as an attention-free
    # model and keeps the derived length as is.
    if not kv_cache_groups:
        logger.info_once(
            "Auto-fit max_model_len: attention-free model, "
            "using derived max_model_len=%d",
            derived_max_len,
            scope="local",
        )
        return

    # The tightest worker decides what can be served.
    smallest_budget = min(available_memory)
    fitted_len = _estimate_max_model_len_from_groups(
        vllm_config, kv_cache_groups, smallest_budget
    )

    if fitted_len <= 0:
        raise ValueError(
            "Cannot auto-fit max_model_len: not enough GPU memory available "
            "to serve even a single token. Try increasing `gpu_memory_utilization`."
        )

    # The configured length already fits: report it and change nothing.
    if fitted_len >= derived_max_len:
        logger.info_once(
            "Auto-fit max_model_len: full model context length %d fits in "
            "available GPU memory",
            derived_max_len,
            scope="local",
        )
        return

    # Otherwise lower the limit to what the memory budget allows.
    vllm_config.model_config.max_model_len = fitted_len
    logger.info_once(
        "Auto-fit max_model_len: reduced from %d to %d to fit in "
        "available GPU memory (%.2f GiB available for KV cache)",
        derived_max_len,
        fitted_len,
        smallest_budget / GiB_bytes,
        scope="local",
    )
def
get_kv_cache_configs
(
def
get_kv_cache_configs
(
vllm_config
:
VllmConfig
,
vllm_config
:
VllmConfig
,
kv_cache_specs
:
list
[
dict
[
str
,
KVCacheSpec
]],
kv_cache_specs
:
list
[
dict
[
str
,
KVCacheSpec
]],
...
@@ -1317,10 +1459,12 @@ def get_kv_cache_configs(
...
@@ -1317,10 +1459,12 @@ def get_kv_cache_configs(
1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
the whole model.
the whole model.
2. Generate the KV cache groups based on the layer ratio of the whole model.
2. Generate the KV cache groups based on the layer ratio of the whole model.
3. Generate the KV cache configs for each worker based on the KV cache
This also handles spec unification for hybrid models.
3. Handle auto-fit max_model_len and memory checks using the unified specs.
4. Generate the KV cache configs for each worker based on the KV cache
grouping strategy. (This is reasonable because the layer ratio of
grouping strategy. (This is reasonable because the layer ratio of
different PP stages are similar.)
different PP stages are similar.)
4
. Change the num_blocks of each worker to the smallest among all workers
5
. Change the num_blocks of each worker to the smallest among all workers
and shrink tensor sizes proportionally to avoid allocating unused memory.
and shrink tensor sizes proportionally to avoid allocating unused memory.
Args:
Args:
...
@@ -1333,14 +1477,6 @@ def get_kv_cache_configs(
...
@@ -1333,14 +1477,6 @@ def get_kv_cache_configs(
The generated KVCacheConfigs for each worker.
The generated KVCacheConfigs for each worker.
"""
"""
# Check if the available memory is enough for each worker.
for
kv_cache_spec_one_worker
,
available_memory_one_worker
in
zip
(
kv_cache_specs
,
available_memory
):
check_enough_kv_cache_memory
(
vllm_config
,
kv_cache_spec_one_worker
,
available_memory_one_worker
)
# Merge the KV cache specs of all workers. Different PP stages may have
# Merge the KV cache specs of all workers. Different PP stages may have
# different layer names, and different TP ranks of the same PP stage should
# different layer names, and different TP ranks of the same PP stage should
# have the same KV cache spec.
# have the same KV cache spec.
...
@@ -1354,8 +1490,52 @@ def get_kv_cache_configs(
...
@@ -1354,8 +1490,52 @@ def get_kv_cache_configs(
"The KV cache specs for the same layer are different "
"The KV cache specs for the same layer are different "
"across workers. This is not supported yet."
"across workers. This is not supported yet."
)
)
# Get global KV cache groups. This also handles spec unification for
# hybrid models when disable_hybrid_kv_cache_manager is enabled.
# After this call, merged_kv_cache_specs may be modified in-place.
global_kv_cache_groups
=
get_kv_cache_groups
(
vllm_config
,
merged_kv_cache_specs
)
global_kv_cache_groups
=
get_kv_cache_groups
(
vllm_config
,
merged_kv_cache_specs
)
# If original_max_model_len was -1, automatically
# determine the maximum model length that fits in available GPU memory.
# We use the global groups here to correctly account for padding.
if
vllm_config
.
model_config
.
original_max_model_len
==
-
1
:
_auto_fit_max_model_len
(
vllm_config
,
global_kv_cache_groups
,
available_memory
)
# Check if the available memory is enough (using min across all workers).
# We use the global groups to correctly account for padding.
if
global_kv_cache_groups
:
min_available_memory
=
min
(
available_memory
)
if
min_available_memory
<=
0
:
raise
ValueError
(
"No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine."
)
max_model_len
=
vllm_config
.
model_config
.
max_model_len
needed_memory
=
_max_memory_usage_bytes_from_groups
(
vllm_config
,
global_kv_cache_groups
)
if
needed_memory
>
min_available_memory
:
estimated_max_len
=
_estimate_max_model_len_from_groups
(
vllm_config
,
global_kv_cache_groups
,
min_available_memory
)
estimated_msg
=
""
if
estimated_max_len
>
0
:
estimated_msg
=
(
f
"Based on the available memory, the estimated maximum "
f
"model length is
{
estimated_max_len
}
. "
)
raise
ValueError
(
f
"To serve at least one request with the models's max seq len "
f
"(
{
max_model_len
}
), (
{
needed_memory
/
GiB_bytes
:.
2
f
}
GiB KV "
f
"cache is needed, which is larger than the available KV cache "
f
"memory (
{
min_available_memory
/
GiB_bytes
:.
2
f
}
GiB). "
f
"
{
estimated_msg
}
"
f
"Try increasing `gpu_memory_utilization` or decreasing "
f
"`max_model_len` when initializing the engine."
)
kv_cache_configs
:
list
[
KVCacheConfig
]
=
[]
kv_cache_configs
:
list
[
KVCacheConfig
]
=
[]
for
kv_cache_spec_one_worker
,
available_memory_one_worker
in
zip
(
for
kv_cache_spec_one_worker
,
available_memory_one_worker
in
zip
(
kv_cache_specs
,
available_memory
kv_cache_specs
,
available_memory
...
...
vllm/v1/engine/core.py
View file @
8ee90c83
...
@@ -247,9 +247,20 @@ class EngineCore:
...
@@ -247,9 +247,20 @@ class EngineCore:
assert
len
(
kv_cache_specs
)
==
len
(
available_gpu_memory
)
assert
len
(
kv_cache_specs
)
==
len
(
available_gpu_memory
)
# Track max_model_len before KV cache config to detect auto-fit changes
max_model_len_before
=
vllm_config
.
model_config
.
max_model_len
kv_cache_configs
=
get_kv_cache_configs
(
kv_cache_configs
=
get_kv_cache_configs
(
vllm_config
,
kv_cache_specs
,
available_gpu_memory
vllm_config
,
kv_cache_specs
,
available_gpu_memory
)
)
# If auto-fit reduced max_model_len, sync the new value to workers.
# This is needed because workers were spawned before memory profiling
# and have the original (larger) max_model_len cached.
max_model_len_after
=
vllm_config
.
model_config
.
max_model_len
if
max_model_len_after
!=
max_model_len_before
:
self
.
collective_rpc
(
"update_max_model_len"
,
args
=
(
max_model_len_after
,))
scheduler_kv_cache_config
=
generate_scheduler_kv_cache_config
(
kv_cache_configs
)
scheduler_kv_cache_config
=
generate_scheduler_kv_cache_config
(
kv_cache_configs
)
num_gpu_blocks
=
scheduler_kv_cache_config
.
num_blocks
num_gpu_blocks
=
scheduler_kv_cache_config
.
num_blocks
num_cpu_blocks
=
0
num_cpu_blocks
=
0
...
...
vllm/v1/worker/gpu_worker.py
View file @
8ee90c83
...
@@ -387,6 +387,19 @@ class Worker(WorkerBase):
...
@@ -387,6 +387,19 @@ class Worker(WorkerBase):
def
get_kv_cache_spec
(
self
)
->
dict
[
str
,
KVCacheSpec
]:
def
get_kv_cache_spec
(
self
)
->
dict
[
str
,
KVCacheSpec
]:
return
self
.
model_runner
.
get_kv_cache_spec
()
return
self
.
model_runner
.
get_kv_cache_spec
()
def update_max_model_len(self, max_model_len: int) -> None:
    """Store a new max_model_len on this worker.

    Writes the value to ``self.model_config`` and, when a model runner
    exists, to the runner as well, then logs the change at debug level.
    Intended to be invoked by the engine after auto-fit has picked a
    different context length than the one the worker started with.

    Args:
        max_model_len: The new maximum model length to record.
    """
    self.model_config.max_model_len = max_model_len

    # The runner keeps its own copy of the limit; update it when present.
    runner = self.model_runner
    if runner is not None:
        runner.max_model_len = max_model_len

    logger.debug("Updated max_model_len to %d", max_model_len)
def
initialize_from_config
(
self
,
kv_cache_config
:
KVCacheConfig
)
->
None
:
def
initialize_from_config
(
self
,
kv_cache_config
:
KVCacheConfig
)
->
None
:
"""Allocate GPU KV cache with the specified kv_cache_config."""
"""Allocate GPU KV cache with the specified kv_cache_config."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment