Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
80433e22
Unverified
Commit
80433e22
authored
Dec 08, 2025
by
Jee Jee Li
Committed by
GitHub
Dec 08, 2025
Browse files
[LoRA] Reduce the loading time of MoE LoRA (#30243)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
5c2433a6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
7 deletions
+27
-7
vllm/lora/models.py
vllm/lora/models.py
+27
-7
No files found.
vllm/lora/models.py
View file @
80433e22
...
...
@@ -115,7 +115,7 @@ class LoRAModel:
weights_mapper
:
WeightsMapper
|
None
=
None
,
)
->
"LoRAModel"
:
"""Create a LoRAModel from a dictionary of tensors."""
pin_memory
=
str
(
device
)
==
"cpu"
and
is_pin_memory_available
()
loras
:
dict
[
str
,
LoRALayerWeights
]
=
{}
for
tensor_name
,
tensor
in
tensors
.
items
():
if
is_base_embeddding_weights
(
tensor_name
):
...
...
@@ -139,14 +139,8 @@ class LoRAModel:
f
" with the base model's vocabulary size(
{
model_vocab_size
}
)."
)
loras
[
module_name
].
lora_a
=
tensor
.
to
(
device
=
device
,
dtype
=
dtype
)
if
pin_memory
:
loras
[
module_name
].
lora_a
=
loras
[
module_name
].
lora_a
.
pin_memory
()
else
:
loras
[
module_name
].
lora_b
=
tensor
.
to
(
device
=
device
,
dtype
=
dtype
)
if
pin_memory
:
loras
[
module_name
].
lora_b
=
loras
[
module_name
].
lora_b
.
pin_memory
()
return
cls
(
lora_model_id
,
peft_helper
.
r
,
loras
)
@
classmethod
...
...
@@ -742,6 +736,32 @@ class LoRAModelManager:
for
lora
in
lora_model
.
loras
.
values
():
lora
.
optimize
()
first_lora
:
LoRALayerWeights
=
next
(
iter
(
lora_model
.
loras
.
values
()))
assert
first_lora
.
lora_a
is
not
None
if
isinstance
(
first_lora
.
lora_a
,
list
):
lora_device
=
next
(
iter
(
first_lora
.
lora_a
))
else
:
lora_device
=
first_lora
.
lora_a
.
device
# Execute pin_memory after LoRA weight merging, mainly because:
# 1. Some MoE models have a large number of LoRA weights. If we
# perform pin_memory immediately after loading weights, the
# overhead is significant.
# 2. The weight packing above (e.g., pack_moe) may invalidate the
# pin_memory allocation, so we execute it after packing.
pin_memory
=
str
(
lora_device
)
==
"cpu"
and
is_pin_memory_available
()
if
pin_memory
:
for
lora
in
lora_model
.
loras
.
values
():
if
isinstance
(
lora
.
lora_a
,
list
):
for
index
in
range
(
len
(
lora
.
lora_a
)):
if
lora
.
lora_a
[
index
]
is
None
:
continue
lora
.
lora_a
[
index
]
=
lora
.
lora_a
[
index
].
pin_memory
()
lora
.
lora_b
[
index
]
=
lora
.
lora_b
[
index
].
pin_memory
()
else
:
lora
.
lora_a
=
lora
.
lora_a
.
pin_memory
()
lora
.
lora_b
=
lora
.
lora_b
.
pin_memory
()
def
_get_lora_layer_weights
(
self
,
lora_model
:
LoRAModel
,
module_name
:
str
)
->
LoRALayerWeights
|
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment