Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3dc01ef3
Unverified
Commit
3dc01ef3
authored
Apr 01, 2026
by
Asaf Gardin
Committed by
GitHub
Mar 31, 2026
Browse files
[Quantization] Consolidate dummy format logic into DummyModelLoader (#38637)
Signed-off-by:
Josephasafg
<
ajgard7@gmail.com
>
parent
cc671cb1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
15 deletions
+42
-15
vllm/model_executor/model_loader/dummy_loader.py
vllm/model_executor/model_loader/dummy_loader.py
+40
-4
vllm/model_executor/model_loader/reload/layerwise.py
vllm/model_executor/model_loader/reload/layerwise.py
+2
-11
No files found.
vllm/model_executor/model_loader/dummy_loader.py
View file @
3dc01ef3
...
...
@@ -4,8 +4,19 @@ import torch.nn as nn
from
vllm.config
import
ModelConfig
from
vllm.config.load
import
LoadConfig
from
vllm.model_executor.layers.quantization.base_config
import
QuantizeMethodBase
from
vllm.model_executor.model_loader.base_loader
import
BaseModelLoader
from
vllm.model_executor.model_loader.weight_utils
import
initialize_dummy_weights
from
vllm.model_executor.model_loader.reload.layerwise
import
(
_get_original_loader
,
get_layerwise_info
,
)
from
vllm.model_executor.model_loader.reload.meta
import
materialize_layer
from
vllm.model_executor.model_loader.reload.types
import
LayerReloadingInfo
from
vllm.model_executor.model_loader.reload.utils
import
get_layer_tensors
from
vllm.model_executor.model_loader.weight_utils
import
(
initialize_dummy_weights
,
initialize_single_dummy_weight
,
)
class
DummyModelLoader
(
BaseModelLoader
):
...
...
@@ -23,6 +34,31 @@ class DummyModelLoader(BaseModelLoader):
pass
# Nothing to download
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    """Fill ``model`` with dummy weights, one submodule at a time.

    Every module yielded by ``model.modules()`` is looked up with
    ``get_layerwise_info``. If the returned info reports ``can_load()``,
    the module is handed to ``_process_online_quant_layer``; otherwise it
    is passed to ``initialize_dummy_weights``.

    Args:
        model: Module tree whose submodules are visited.
        model_config: Forwarded unchanged to ``initialize_dummy_weights``.
    """
    for layer in model.modules():
        info = get_layerwise_info(layer)
        if info.can_load():
            self._process_online_quant_layer(layer, info)
            continue

        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
        # Only ``layer`` is initialized here. Calling the initializer on the
        # whole ``model`` from inside this loop would repeat a full-model
        # pass for every module and would presumably also rewrite layers
        # already handled by the branch above.
        initialize_dummy_weights(layer, model_config)
def _process_online_quant_layer(
    self,
    layer: nn.Module,
    info: LayerReloadingInfo,
) -> None:
    """Run the dummy-load pipeline for one layer that ``info`` can load.

    Steps, in order: materialize the layer, write dummy values into each
    of its tensors, reassign each tensor's ``weight_loader``, run the
    quantization post-processing hook when the layer has one, and finally
    reset ``info``.

    Args:
        layer: The module to process.
        info: Reloading info for ``layer``; ``reset()`` is called on it
            before returning.
    """
    materialize_layer(layer, info)

    # First pass: give every tensor of the layer dummy contents.
    for dummy_target in get_layer_tensors(layer).values():
        initialize_single_dummy_weight(dummy_target)

    # Second pass over a fresh lookup: reassign each tensor's loader
    # (presumably restoring the one it had before layerwise wrapping).
    for layer_param in get_layer_tensors(layer).values():
        layer_param.weight_loader = _get_original_loader(layer_param)

    # Layers without a QuantizeMethodBase ``quant_method`` skip the hook.
    method = getattr(layer, "quant_method", None)
    if isinstance(method, QuantizeMethodBase):
        method.process_weights_after_loading(layer)

    info.reset()
vllm/model_executor/model_loader/reload/layerwise.py
View file @
3dc01ef3
...
...
@@ -11,10 +11,7 @@ from vllm.config import ModelConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention
import
Attention
,
MLAAttention
from
vllm.model_executor.layers.quantization.base_config
import
QuantizeMethodBase
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
initialize_single_dummy_weight
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
.meta
import
(
capture_layer_to_meta
,
...
...
@@ -224,7 +221,7 @@ def finalize_layerwise_processing(model: torch.nn.Module, model_config: ModelCon
# No weights were loaded
elif
info
.
load_numel
<=
0
:
# first load
but received no weights. This happens on dummy load
# first load
: checkpoint did not contain weights for this layer
if
info
.
kernel_tensors
is
None
:
_layerwise_process
(
layer
,
info
)
continue
...
...
@@ -262,12 +259,6 @@ def _layerwise_process(layer: torch.nn.Module, info: LayerReloadingInfo):
# Materialize layer tensors onto device
materialize_layer
(
layer
,
info
)
# If no weights were loaded (e.g. dummy loading), initialize with
# small random values to avoid NaN from zero/garbage data
if
len
(
info
.
loaded_weights
)
<=
0
:
for
tensor
in
get_layer_tensors
(
layer
).
values
():
initialize_single_dummy_weight
(
tensor
)
# Reset online quantization flag so process_weights_after_loading
# will run again during reload
if
hasattr
(
layer
,
"_already_called_process_weights_after_loading"
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment