Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
abf3db40
Unverified
Commit
abf3db40
authored
Oct 22, 2025
by
Jee Jee Li
Committed by
GitHub
Oct 22, 2025
Browse files
[Core] Handle MoE LoRA edge cases (#27335)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
8e4ca4d1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
11 deletions
+7
-11
vllm/lora/layers/fused_moe.py
vllm/lora/layers/fused_moe.py
+7
-10
vllm/lora/models.py
vllm/lora/models.py
+0
-1
No files found.
vllm/lora/layers/fused_moe.py
View file @
abf3db40
...
@@ -74,7 +74,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -74,7 +74,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
moe_state_dict
[
"apply_router_weight_on_input"
]
=
kwargs
[
moe_state_dict
[
"apply_router_weight_on_input"
]
=
kwargs
[
"apply_router_weight_on_input"
"apply_router_weight_on_input"
]
]
moe_state_dict
[
"max_loras"
]
=
layer
.
w1_lora_a_stacked
.
shape
[
0
]
result
=
func
(
*
args
,
**
kwargs
)
result
=
func
(
*
args
,
**
kwargs
)
return
result
return
result
...
@@ -89,7 +88,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -89,7 +88,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
curr_topk_ids
=
moe_state_dict
[
"topk_ids"
]
curr_topk_ids
=
moe_state_dict
[
"topk_ids"
]
global_num_experts
=
moe_state_dict
[
"global_num_experts"
]
global_num_experts
=
moe_state_dict
[
"global_num_experts"
]
expert_map
=
moe_state_dict
[
"expert_map"
]
expert_map
=
moe_state_dict
[
"expert_map"
]
max_loras
=
moe_state_dict
[
"max_loras"
]
config_dtype
=
_get_config_dtype_str
(
config_dtype
=
_get_config_dtype_str
(
dtype
=
hidden_states
.
dtype
,
dtype
=
hidden_states
.
dtype
,
...
@@ -110,6 +108,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -110,6 +108,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
block_shape
=
layer
.
quant_method
.
moe_quant_config
.
block_shape
,
block_shape
=
layer
.
quant_method
.
moe_quant_config
.
block_shape
,
)
)
max_loras
=
self
.
w1_lora_a_stacked
.
shape
[
0
]
config
=
get_config_func
(
M
)
config
=
get_config_func
(
M
)
(
(
sorted_token_ids_lora
,
sorted_token_ids_lora
,
...
@@ -161,7 +160,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -161,7 +160,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
def
wrapper
(
*
args
,
**
kwargs
):
def
wrapper
(
*
args
,
**
kwargs
):
hidden_states
=
moe_state_dict
[
"hidden_states"
]
hidden_states
=
moe_state_dict
[
"hidden_states"
]
topk_weights
=
moe_state_dict
[
"topk_weights"
]
topk_weights
=
moe_state_dict
[
"topk_weights"
]
max_loras
=
moe_state_dict
[
"max_loras"
]
config_dtype
=
_get_config_dtype_str
(
config_dtype
=
_get_config_dtype_str
(
dtype
=
hidden_states
.
dtype
,
dtype
=
hidden_states
.
dtype
,
...
@@ -189,7 +187,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -189,7 +187,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
num_tokens_post_padded_lora
=
moe_state_dict
[
num_tokens_post_padded_lora
=
moe_state_dict
[
"num_tokens_post_padded_lora"
"num_tokens_post_padded_lora"
]
]
max_loras
=
self
.
w1_lora_a_stacked
.
shape
[
0
]
expert_ids_lora
=
expert_ids_lora
.
view
(
max_loras
,
-
1
)
expert_ids_lora
=
expert_ids_lora
.
view
(
max_loras
,
-
1
)
sorted_token_ids_lora
=
sorted_token_ids_lora
.
view
(
max_loras
,
-
1
)
sorted_token_ids_lora
=
sorted_token_ids_lora
.
view
(
max_loras
,
-
1
)
intermediate_cache2
=
moe_state_dict
[
"intermediate_cache2"
]
intermediate_cache2
=
moe_state_dict
[
"intermediate_cache2"
]
...
@@ -305,12 +303,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -305,12 +303,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
device
=
self
.
device
,
device
=
self
.
device
,
)
)
self
.
base_layer
.
w1_lora_a_stacked
=
self
.
w1_lora_a_stacked
self
.
base_layer
.
w1_lora_b_stacked
=
self
.
w1_lora_b_stacked
self
.
base_layer
.
w2_lora_a_stacked
=
self
.
w2_lora_a_stacked
self
.
base_layer
.
w2_lora_b_stacked
=
self
.
w2_lora_b_stacked
self
.
base_layer
.
w3_lora_a_stacked
=
self
.
w3_lora_a_stacked
self
.
base_layer
.
w3_lora_b_stacked
=
self
.
w3_lora_b_stacked
# They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
# They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
# to create a dummy LoRA weights.
# to create a dummy LoRA weights.
self
.
lora_a_stacked
=
[]
self
.
lora_a_stacked
=
[]
...
@@ -343,6 +335,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -343,6 +335,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
embeddings_tensor
:
torch
.
Tensor
|
None
,
embeddings_tensor
:
torch
.
Tensor
|
None
,
bias
:
torch
.
Tensor
|
None
=
None
,
bias
:
torch
.
Tensor
|
None
=
None
,
):
):
self
.
reset_lora
(
index
)
"""Overwrites lora tensors at index."""
"""Overwrites lora tensors at index."""
for
eid
in
range
(
len
(
lora_a
)
//
3
):
for
eid
in
range
(
len
(
lora_a
)
//
3
):
w1_lora_a
=
lora_a
[
eid
*
3
]
w1_lora_a
=
lora_a
[
eid
*
3
]
...
@@ -352,6 +345,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
...
@@ -352,6 +345,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
w2_lora_b
=
lora_b
[
eid
*
3
+
1
]
w2_lora_b
=
lora_b
[
eid
*
3
+
1
]
w3_lora_b
=
lora_b
[
eid
*
3
+
2
]
w3_lora_b
=
lora_b
[
eid
*
3
+
2
]
# Handle the case of adding LoRA to only a subset of experts
if
w1_lora_a
is
None
or
w2_lora_a
is
None
or
w3_lora_a
is
None
:
continue
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
shard_size
=
self
.
base_layer
.
intermediate_size_per_partition
shard_size
=
self
.
base_layer
.
intermediate_size_per_partition
start_idx
=
self
.
tp_rank
*
shard_size
start_idx
=
self
.
tp_rank
*
shard_size
...
...
vllm/lora/models.py
View file @
abf3db40
...
@@ -426,7 +426,6 @@ class LoRAModelManager:
...
@@ -426,7 +426,6 @@ class LoRAModelManager:
for
module_name
,
module
in
self
.
modules
.
items
():
for
module_name
,
module
in
self
.
modules
.
items
():
module_lora
=
self
.
_get_lora_layer_weights
(
lora_model
,
module_name
)
module_lora
=
self
.
_get_lora_layer_weights
(
lora_model
,
module_name
)
if
module_lora
:
if
module_lora
:
module_lora
.
optimize
()
# Note (gnovack) - If MOE lora weights are not split into
# Note (gnovack) - If MOE lora weights are not split into
# num_experts chunks, we split them here
# num_experts chunks, we split them here
if
isinstance
(
module
,
FusedMoEWithLoRA
)
and
torch
.
is_tensor
(
if
isinstance
(
module
,
FusedMoEWithLoRA
)
and
torch
.
is_tensor
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment