Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
9394ed63
Unverified
Commit
9394ed63
authored
Aug 13, 2025
by
fzyzcjy
Committed by
GitHub
Aug 13, 2025
Browse files
Fix gpt-oss ~2x memory consumption issue (#9146)
parent
930fe467
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
19 additions
and
7 deletions
+19
-7
python/sglang/srt/models/gpt_oss.py
python/sglang/srt/models/gpt_oss.py
+19
-7
No files found.
python/sglang/srt/models/gpt_oss.py
View file @
9394ed63
...
@@ -64,7 +64,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
...
@@ -64,7 +64,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
from
sglang.srt.managers.schedule_batch
import
global_server_args_dict
from
sglang.srt.managers.schedule_batch
import
global_server_args_dict
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
PPProxyTensors
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
PPProxyTensors
from
sglang.srt.model_loader.weight_utils
import
default_weight_loader
from
sglang.srt.model_loader.weight_utils
import
default_weight_loader
from
sglang.srt.utils
import
add_prefix
,
is_cuda
,
is_flashinfer_available
,
make_layers
from
sglang.srt.utils
import
(
LazyValue
,
add_prefix
,
is_cuda
,
is_flashinfer_available
,
make_layers
,
)
_is_cuda
=
is_cuda
()
_is_cuda
=
is_cuda
()
_is_flashinfer_available
=
is_flashinfer_available
()
_is_flashinfer_available
=
is_flashinfer_available
()
...
@@ -655,6 +661,18 @@ class GptOssForCausalLM(nn.Module):
...
@@ -655,6 +661,18 @@ class GptOssForCausalLM(nn.Module):
self
.
logits_processor
=
LogitsProcessor
(
config
)
self
.
logits_processor
=
LogitsProcessor
(
config
)
self
.
capture_aux_hidden_states
=
False
self
.
capture_aux_hidden_states
=
False
self
.
_routed_experts_weights_of_layer
=
LazyValue
(
lambda
:
{
layer_id
:
self
.
model
.
layers
[
layer_id
].
mlp
.
get_moe_weights
()
for
layer_id
in
range
(
self
.
start_layer
,
self
.
end_layer
)
if
isinstance
(
self
.
model
.
layers
[
layer_id
].
mlp
,
GptOssSparseMoeBlock
)
}
)
@
property
def
routed_experts_weights_of_layer
(
self
):
return
self
.
_routed_experts_weights_of_layer
.
value
@
torch
.
no_grad
()
@
torch
.
no_grad
()
def
forward
(
def
forward
(
self
,
self
,
...
@@ -1138,12 +1156,6 @@ class GptOssForCausalLM(nn.Module):
...
@@ -1138,12 +1156,6 @@ class GptOssForCausalLM(nn.Module):
else
:
else
:
logging
.
info
(
"All parameters loaded successfully."
)
logging
.
info
(
"All parameters loaded successfully."
)
self
.
routed_experts_weights_of_layer
=
{
layer_id
:
self
.
model
.
layers
[
layer_id
].
mlp
.
get_moe_weights
()
for
layer_id
in
range
(
self
.
start_layer
,
self
.
end_layer
)
if
isinstance
(
self
.
model
.
layers
[
layer_id
].
mlp
,
GptOssSparseMoeBlock
)
}
def
get_embed_and_head
(
self
):
def
get_embed_and_head
(
self
):
return
self
.
model
.
embed_tokens
.
weight
,
self
.
lm_head
.
weight
return
self
.
model
.
embed_tokens
.
weight
,
self
.
lm_head
.
weight
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment