Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
dccf52f9
Unverified
Commit
dccf52f9
authored
Sep 10, 2025
by
ryang
Committed by
GitHub
Sep 09, 2025
Browse files
[UT for RL] Add UT to cover release/resume memory case for moe model (#8803)
parent
676a7b51
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
79 additions
and
5 deletions
+79
-5
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+2
-1
test/srt/test_expert_distribution.py
test/srt/test_expert_distribution.py
+1
-1
test/srt/test_release_memory_occupation.py
test/srt/test_release_memory_occupation.py
+74
-1
test/srt/test_torch_compile_moe.py
test/srt/test_torch_compile_moe.py
+2
-2
No files found.
python/sglang/test/test_utils.py
View file @
dccf52f9
...
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
...
@@ -42,7 +42,8 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
=
"meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
=
"meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
=
"meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
=
"meta-llama/Llama-3.2-1B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST
=
"mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_MOE_MODEL_NAME_FOR_TEST
=
"mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
=
"Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
=
"Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT
=
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
# MLA test models
# MLA test models
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
=
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
=
"Alibaba-NLP/gte-Qwen2-1.5B-instruct"
...
...
test/srt/test_expert_distribution.py
View file @
dccf52f9
...
@@ -8,7 +8,7 @@ import torch
...
@@ -8,7 +8,7 @@ import torch
from
sglang.srt.utils
import
kill_process_tree
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
_BASE
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
CustomTestCase
,
...
...
test/srt/test_release_memory_occupation.py
View file @
dccf52f9
...
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
...
@@ -38,6 +38,8 @@ from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGH
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT
,
CustomTestCase
,
CustomTestCase
,
)
)
...
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
...
@@ -50,7 +52,7 @@ def get_gpu_memory_gb():
class
TestReleaseMemoryOccupation
(
CustomTestCase
):
class
TestReleaseMemoryOccupation
(
CustomTestCase
):
def
_setup_engine
(
self
,
model_name
,
mem_fraction_static
=
0.8
,
tp_size
=
1
):
def
_setup_engine
(
self
,
model_name
,
mem_fraction_static
=
0.8
,
tp_size
=
1
,
ep_size
=
1
):
"""Common setup for engine and HF model."""
"""Common setup for engine and HF model."""
engine
=
sgl
.
Engine
(
engine
=
sgl
.
Engine
(
model_path
=
model_name
,
model_path
=
model_name
,
...
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
...
@@ -58,6 +60,7 @@ class TestReleaseMemoryOccupation(CustomTestCase):
enable_memory_saver
=
True
,
enable_memory_saver
=
True
,
mem_fraction_static
=
mem_fraction_static
,
mem_fraction_static
=
mem_fraction_static
,
tp_size
=
tp_size
,
tp_size
=
tp_size
,
ep_size
=
ep_size
,
# disable_cuda_graph=True, # for debugging only
# disable_cuda_graph=True, # for debugging only
)
)
...
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
...
@@ -70,6 +73,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
"sampling_params"
:
{
"temperature"
:
0
,
"max_new_tokens"
:
8
},
"sampling_params"
:
{
"temperature"
:
0
,
"max_new_tokens"
:
8
},
"expect_output_before_update_weights"
:
" to spend it outdoors. I decided to"
,
"expect_output_before_update_weights"
:
" to spend it outdoors. I decided to"
,
"expect_output_after_update_weights"
:
" to go for a walk. I like"
,
"expect_output_after_update_weights"
:
" to go for a walk. I like"
,
"prompt_moe"
:
"The weather is nice today, and I want to"
,
"sampling_params_moe"
:
{
"temperature"
:
0
,
"max_new_tokens"
:
16
},
"expect_output_before_update_weights_moe"
:
" go to the park. I have a picnic basket, a book, and a"
,
"expect_output_after_update_weights_moe"
:
" go to the park. I have a lot of things to do, but I"
,
}
}
def
_test_initial_generation
(
def
_test_initial_generation
(
...
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
...
@@ -250,6 +257,72 @@ class TestReleaseMemoryOccupation(CustomTestCase):
self
.
assertEqual
(
outputs
,
params
[
"expect_output_after_update_weights"
])
self
.
assertEqual
(
outputs
,
params
[
"expect_output_after_update_weights"
])
engine
.
shutdown
()
engine
.
shutdown
()
def test_moe_model_release_and_resume(self):
    """Exercise release/resume of GPU memory on an MoE model.

    Steps:
      1. Launch an engine for the chat MoE checkpoint with the memory
         saver enabled and ``tp_size == ep_size == 2``.
      2. Check the initial generation against the expected MoE output.
      3. Release memory occupation and assert that the GPU memory usage
         reported by ``get_gpu_memory_gb()`` decreased.
      4. Resume memory occupation, then push the parameters of the base
         MoE checkpoint (loaded through Hugging Face) into the engine.
      5. Generate again and check the output expected after the update.

    The engine is shut down in a ``finally`` clause so that a failing
    assertion does not leave the engine running.
    """
    # Test with MoE model
    model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT

    tp_size = ep_size = 2

    print(
        f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
    )
    engine = sgl.Engine(
        model_path=model_name,
        random_seed=42,
        enable_memory_saver=True,
        mem_fraction_static=0.5,
        tp_size=tp_size,
        ep_size=ep_size,
    )
    try:
        params = self._common_test_params()

        self._test_initial_generation(
            engine,
            params["prompt_moe"],
            params["sampling_params_moe"],
            params["expect_output_before_update_weights_moe"],
        )

        t = time.perf_counter()
        gpu_memory_usage_before_release = get_gpu_memory_gb()
        engine.release_memory_occupation()
        gpu_memory_usage_after_release = get_gpu_memory_gb()

        # Releasing must lower the reported GPU memory usage.
        self.assertLess(
            gpu_memory_usage_after_release,
            gpu_memory_usage_before_release,
        )

        print(
            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
        )

        if _DEBUG_EXTRA:
            time.sleep(3)

        t = time.perf_counter()
        engine.resume_memory_occupation()
        print(
            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
        )

        # Load the base checkpoint so the update changes the weights the
        # engine started with (it was launched on the chat checkpoint).
        hf_model_new = AutoModelForCausalLM.from_pretrained(
            DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
            torch_dtype="bfloat16",
            device_map="cuda",
        )
        engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))

        # destroy the hf model
        del hf_model_new
        torch.cuda.empty_cache()

        print("generate (#2)")
        outputs = engine.generate(
            params["prompt_moe"], params["sampling_params_moe"]
        )["text"]
        self.assertEqual(outputs, params["expect_output_after_update_weights_moe"])
    finally:
        # Always stop the engine, including when an assertion above fails.
        engine.shutdown()
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
test/srt/test_torch_compile_moe.py
View file @
dccf52f9
...
@@ -7,7 +7,7 @@ import requests
...
@@ -7,7 +7,7 @@ import requests
from
sglang.srt.utils
import
is_cuda
,
kill_process_tree
from
sglang.srt.utils
import
is_cuda
,
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
,
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
_BASE
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
CustomTestCase
,
...
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
...
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
class
TestTorchCompileMoe
(
CustomTestCase
):
class
TestTorchCompileMoe
(
CustomTestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
cls
.
model
=
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
cls
.
model
=
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST
_BASE
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
process
=
popen_launch_server
(
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
model
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment