Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b37b6797
Unverified
Commit
b37b6797
authored
Feb 13, 2026
by
Wei Zhao
Committed by
GitHub
Feb 13, 2026
Browse files
[Feature][Perf] Support Selective CPU Weight Offloading (#34535)
Signed-off-by:
wzhao18
<
wzhao18.sz@gmail.com
>
parent
a0638d05
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
44 additions
and
2 deletions
+44
-2
vllm/config/cache.py
vllm/config/cache.py
+11
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+5
-0
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+23
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+5
-1
No files found.
vllm/config/cache.py
View file @
b37b6797
...
@@ -101,6 +101,17 @@ class CacheConfig:
...
@@ -101,6 +101,17 @@ class CacheConfig:
Note that this requires fast CPU-GPU interconnect, as part of the model is
Note that this requires fast CPU-GPU interconnect, as part of the model is
loaded from CPU memory to GPU memory on the fly in each model forward pass.
loaded from CPU memory to GPU memory on the fly in each model forward pass.
"""
"""
cpu_offload_params
:
set
[
str
]
=
Field
(
default_factory
=
set
)
""" The set of parameter name segments to target for CPU offloading.
Unmatched parameters are not offloaded. If this set is empty, parameters
are offloaded non-selectively until the memory limit defined by
`cpu_offload_gb` is reached.
Examples:
- For parameter name "mlp.experts.w2_weight":
- "experts" or "experts.w2_weight" will match.
- "expert" or "w2" will NOT match (must be exact segments).
This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
"""
calculate_kv_scales
:
bool
=
False
calculate_kv_scales
:
bool
=
False
"""This enables dynamic calculation of `k_scale` and `v_scale` when
"""This enables dynamic calculation of `k_scale` and `v_scale` when
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
...
...
vllm/engine/arg_utils.py
View file @
b37b6797
...
@@ -434,6 +434,7 @@ class EngineArgs:
...
@@ -434,6 +434,7 @@ class EngineArgs:
disable_cascade_attn
:
bool
=
ModelConfig
.
disable_cascade_attn
disable_cascade_attn
:
bool
=
ModelConfig
.
disable_cascade_attn
swap_space
:
float
=
CacheConfig
.
swap_space
swap_space
:
float
=
CacheConfig
.
swap_space
cpu_offload_gb
:
float
=
CacheConfig
.
cpu_offload_gb
cpu_offload_gb
:
float
=
CacheConfig
.
cpu_offload_gb
cpu_offload_params
:
set
[
str
]
=
get_field
(
CacheConfig
,
"cpu_offload_params"
)
gpu_memory_utilization
:
float
=
CacheConfig
.
gpu_memory_utilization
gpu_memory_utilization
:
float
=
CacheConfig
.
gpu_memory_utilization
kv_cache_memory_bytes
:
int
|
None
=
CacheConfig
.
kv_cache_memory_bytes
kv_cache_memory_bytes
:
int
|
None
=
CacheConfig
.
kv_cache_memory_bytes
max_num_batched_tokens
:
int
|
None
=
None
max_num_batched_tokens
:
int
|
None
=
None
...
@@ -942,6 +943,9 @@ class EngineArgs:
...
@@ -942,6 +943,9 @@ class EngineArgs:
"--prefix-caching-hash-algo"
,
**
cache_kwargs
[
"prefix_caching_hash_algo"
]
"--prefix-caching-hash-algo"
,
**
cache_kwargs
[
"prefix_caching_hash_algo"
]
)
)
cache_group
.
add_argument
(
"--cpu-offload-gb"
,
**
cache_kwargs
[
"cpu_offload_gb"
])
cache_group
.
add_argument
(
"--cpu-offload-gb"
,
**
cache_kwargs
[
"cpu_offload_gb"
])
cache_group
.
add_argument
(
"--cpu-offload-params"
,
**
cache_kwargs
[
"cpu_offload_params"
]
)
cache_group
.
add_argument
(
cache_group
.
add_argument
(
"--calculate-kv-scales"
,
**
cache_kwargs
[
"calculate_kv_scales"
]
"--calculate-kv-scales"
,
**
cache_kwargs
[
"calculate_kv_scales"
]
)
)
...
@@ -1453,6 +1457,7 @@ class EngineArgs:
...
@@ -1453,6 +1457,7 @@ class EngineArgs:
enable_prefix_caching
=
self
.
enable_prefix_caching
,
enable_prefix_caching
=
self
.
enable_prefix_caching
,
prefix_caching_hash_algo
=
self
.
prefix_caching_hash_algo
,
prefix_caching_hash_algo
=
self
.
prefix_caching_hash_algo
,
cpu_offload_gb
=
self
.
cpu_offload_gb
,
cpu_offload_gb
=
self
.
cpu_offload_gb
,
cpu_offload_params
=
self
.
cpu_offload_params
,
calculate_kv_scales
=
self
.
calculate_kv_scales
,
calculate_kv_scales
=
self
.
calculate_kv_scales
,
kv_sharing_fast_prefill
=
self
.
kv_sharing_fast_prefill
,
kv_sharing_fast_prefill
=
self
.
kv_sharing_fast_prefill
,
mamba_cache_dtype
=
self
.
mamba_cache_dtype
,
mamba_cache_dtype
=
self
.
mamba_cache_dtype
,
...
...
vllm/model_executor/models/utils.py
View file @
b37b6797
...
@@ -31,6 +31,7 @@ from vllm.model_executor.models.interfaces import supports_any_eagle
...
@@ -31,6 +31,7 @@ from vllm.model_executor.models.interfaces import supports_any_eagle
from
vllm.multimodal
import
NestedTensors
from
vllm.multimodal
import
NestedTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.mem_utils
import
format_gib
from
vllm.utils.platform_utils
import
(
from
vllm.utils.platform_utils
import
(
is_pin_memory_available
,
is_pin_memory_available
,
is_uva_available
,
is_uva_available
,
...
@@ -613,6 +614,7 @@ class PPMissingLayer(torch.nn.Identity):
...
@@ -613,6 +614,7 @@ class PPMissingLayer(torch.nn.Identity):
_CPU_OFFLOAD_BYTES
=
0
_CPU_OFFLOAD_BYTES
=
0
_CPU_OFFLOAD_MAX_BYTES
=
0
_CPU_OFFLOAD_MAX_BYTES
=
0
_CPU_OFFLOAD_PARAMS
=
set
()
def
set_cpu_offload_max_bytes
(
max_bytes
:
int
)
->
None
:
def
set_cpu_offload_max_bytes
(
max_bytes
:
int
)
->
None
:
...
@@ -621,6 +623,11 @@ def set_cpu_offload_max_bytes(max_bytes: int) -> None:
...
@@ -621,6 +623,11 @@ def set_cpu_offload_max_bytes(max_bytes: int) -> None:
_CPU_OFFLOAD_MAX_BYTES
=
max_bytes
_CPU_OFFLOAD_MAX_BYTES
=
max_bytes
def
set_cpu_offload_params
(
params
:
set
[
str
])
->
None
:
global
_CPU_OFFLOAD_PARAMS
_CPU_OFFLOAD_PARAMS
=
params
def
maybe_offload_to_cpu
(
module
:
torch
.
nn
.
Module
)
->
torch
.
nn
.
Module
:
def
maybe_offload_to_cpu
(
module
:
torch
.
nn
.
Module
)
->
torch
.
nn
.
Module
:
if
(
params
:
=
next
(
module
.
parameters
(),
None
))
is
None
:
if
(
params
:
=
next
(
module
.
parameters
(),
None
))
is
None
:
return
module
return
module
...
@@ -642,12 +649,23 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
...
@@ -642,12 +649,23 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
# offload parameters to CPU
# offload parameters to CPU
# use pin_memory if possible, which helps cudagraph capture speed
# use pin_memory if possible, which helps cudagraph capture speed
offloaded_parameters
=
False
offloaded_parameters
=
False
for
p
in
module
.
parameters
():
for
name
,
p
in
module
.
named_
parameters
():
if
_CPU_OFFLOAD_BYTES
>=
_CPU_OFFLOAD_MAX_BYTES
:
if
_CPU_OFFLOAD_BYTES
>=
_CPU_OFFLOAD_MAX_BYTES
:
# we use per-parameter offloading
# we use per-parameter offloading
# one module might have some parameters offloaded and some not
# one module might have some parameters offloaded and some not
break
break
if
_CPU_OFFLOAD_PARAMS
:
# Check if parameter belongs to the offloading set
# Add dots here to ensure we match full segments only
# e.g., "experts.w2_weight" matches "mlp.experts.w2_weight" but not
# "mlp.experts.w2_weight_scale"
should_offload
=
any
(
f
".
{
param
}
."
in
f
".
{
name
}
."
for
param
in
_CPU_OFFLOAD_PARAMS
)
if
not
should_offload
:
continue
cpu_data
=
p
.
data
.
to
(
device
=
"cpu"
)
cpu_data
=
p
.
data
.
to
(
device
=
"cpu"
)
if
pin_memory
:
if
pin_memory
:
cpu_data
=
cpu_data
.
pin_memory
()
cpu_data
=
cpu_data
.
pin_memory
()
...
@@ -708,6 +726,10 @@ def make_layers(
...
@@ -708,6 +726,10 @@ def make_layers(
]
]
+
[
PPMissingLayer
()
for
_
in
range
(
end_layer
,
num_hidden_layers
)]
+
[
PPMissingLayer
()
for
_
in
range
(
end_layer
,
num_hidden_layers
)]
)
)
if
_CPU_OFFLOAD_MAX_BYTES
>
0
:
logger
.
info
(
"Total CPU offloaded parameters: %s GBs"
,
format_gib
(
_CPU_OFFLOAD_BYTES
)
)
return
start_layer
,
end_layer
,
modules
return
start_layer
,
end_layer
,
modules
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
b37b6797
...
@@ -345,9 +345,13 @@ class GPUModelRunner(
...
@@ -345,9 +345,13 @@ class GPUModelRunner(
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
observability_config
=
vllm_config
.
observability_config
from
vllm.model_executor.models.utils
import
set_cpu_offload_max_bytes
from
vllm.model_executor.models.utils
import
(
set_cpu_offload_max_bytes
,
set_cpu_offload_params
,
)
set_cpu_offload_max_bytes
(
int
(
self
.
cache_config
.
cpu_offload_gb
*
1024
**
3
))
set_cpu_offload_max_bytes
(
int
(
self
.
cache_config
.
cpu_offload_gb
*
1024
**
3
))
set_cpu_offload_params
(
self
.
cache_config
.
cpu_offload_params
)
model_config
=
self
.
model_config
model_config
=
self
.
model_config
cache_config
=
self
.
cache_config
cache_config
=
self
.
cache_config
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment