Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
26f07294
Unverified
Commit
26f07294
authored
Mar 26, 2025
by
fzyzcjy
Committed by
GitHub
Mar 26, 2025
Browse files
Warn users when release_memory_occupation is called without memory saver enabled (#4566)
parent
34e07a65
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
50 additions
and
12 deletions
+50
-12
.github/workflows/pr-test-amd.yml
.github/workflows/pr-test-amd.yml
+6
-6
.github/workflows/release-docs.yml
.github/workflows/release-docs.yml
+1
-1
python/pyproject.toml
python/pyproject.toml
+1
-1
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+4
-0
python/sglang/srt/model_executor/model_runner.py
python/sglang/srt/model_executor/model_runner.py
+8
-1
python/sglang/srt/torch_memory_saver_adapter.py
python/sglang/srt/torch_memory_saver_adapter.py
+22
-0
python/sglang/test/attention/test_flashattn_backend.py
python/sglang/test/attention/test_flashattn_backend.py
+2
-1
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+5
-0
test/srt/test_mla_int8_deepseek_v3.py
test/srt/test_mla_int8_deepseek_v3.py
+1
-1
test/srt/test_vision_openai_server.py
test/srt/test_vision_openai_server.py
+0
-1
No files found.
.github/workflows/pr-test-amd.yml
View file @
26f07294
...
@@ -22,7 +22,7 @@ concurrency:
...
@@ -22,7 +22,7 @@ concurrency:
jobs
:
jobs
:
accuracy-test-1-gpu-amd
:
accuracy-test-1-gpu-amd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
github.event.pull_request.draft ==
false
runs-on
:
linux-mi300-gpu-1
runs-on
:
linux-mi300-gpu-1
steps
:
steps
:
-
name
:
Checkout code
-
name
:
Checkout code
...
@@ -56,13 +56,13 @@ jobs:
...
@@ -56,13 +56,13 @@ jobs:
-
name
:
Evaluate Accuracy
-
name
:
Evaluate Accuracy
timeout-minutes
:
20
timeout-minutes
:
20
run
:
|
run
:
|
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py
docker exec -w /sglang-checkout/test/srt
-e SGLANG_IS_IN_CI=1
ci_sglang python3 test_eval_accuracy_large.py
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_fp8_accuracy.py
docker exec -w /sglang-checkout/test/srt
-e SGLANG_IS_IN_CI=1
ci_sglang python3 test_eval_fp8_accuracy.py
docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py
docker exec -w /sglang-checkout/test/srt
-e SGLANG_IS_IN_CI=1
ci_sglang python3 models/test_qwen_models.py
mla-test-1-gpu-amd
:
mla-test-1-gpu-amd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
github.event.pull_request.draft ==
false
runs-on
:
linux-mi300-gpu-1
runs-on
:
linux-mi300-gpu-1
steps
:
steps
:
-
name
:
Checkout code
-
name
:
Checkout code
...
@@ -96,7 +96,7 @@ jobs:
...
@@ -96,7 +96,7 @@ jobs:
-
name
:
MLA TEST
-
name
:
MLA TEST
timeout-minutes
:
20
timeout-minutes
:
20
run
:
|
run
:
|
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
docker exec -w /sglang-checkout/test/srt
-e SGLANG_IS_IN_CI=1
ci_sglang python3 test_mla.py
finish
:
finish
:
if
:
always()
if
:
always()
...
...
.github/workflows/release-docs.yml
View file @
26f07294
...
@@ -33,7 +33,7 @@ jobs:
...
@@ -33,7 +33,7 @@ jobs:
pip install -r docs/requirements.txt
pip install -r docs/requirements.txt
apt-get update
apt-get update
apt-get install -y pandoc
apt-get install -y pandoc
apt-get update && apt-get install -y parallel
apt-get update && apt-get install -y parallel
retry
-
name
:
Setup Jupyter Kernel
-
name
:
Setup Jupyter Kernel
run
:
|
run
:
|
...
...
python/pyproject.toml
View file @
26f07294
...
@@ -72,7 +72,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
...
@@ -72,7 +72,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
openai
=
[
"openai>=1.0"
,
"tiktoken"
]
openai
=
[
"openai>=1.0"
,
"tiktoken"
]
anthropic
=
["anthropic>=0.20.0"]
anthropic
=
["anthropic>=0.20.0"]
litellm
=
["litellm>=1.0.0"]
litellm
=
["litellm>=1.0.0"]
torch_memory_saver
=
["torch_memory_saver"]
torch_memory_saver
=
["torch_memory_saver
>=0.0.3
"]
test
=
[
test
=
[
"jsonlines"
,
"jsonlines"
,
"matplotlib"
,
"matplotlib"
,
...
...
python/sglang/srt/managers/scheduler.py
View file @
26f07294
...
@@ -1790,6 +1790,9 @@ class Scheduler(
...
@@ -1790,6 +1790,9 @@ class Scheduler(
return
GetWeightsByNameReqOutput
(
parameter
)
return
GetWeightsByNameReqOutput
(
parameter
)
def
release_memory_occupation
(
self
,
recv_req
:
ReleaseMemoryOccupationReqInput
):
def
release_memory_occupation
(
self
,
recv_req
:
ReleaseMemoryOccupationReqInput
):
self
.
memory_saver_adapter
.
check_validity
(
caller_name
=
"release_memory_occupation"
)
self
.
stashed_model_static_state
=
_export_static_state
(
self
.
stashed_model_static_state
=
_export_static_state
(
self
.
tp_worker
.
worker
.
model_runner
.
model
self
.
tp_worker
.
worker
.
model_runner
.
model
)
)
...
@@ -1798,6 +1801,7 @@ class Scheduler(
...
@@ -1798,6 +1801,7 @@ class Scheduler(
return
ReleaseMemoryOccupationReqOutput
()
return
ReleaseMemoryOccupationReqOutput
()
def
resume_memory_occupation
(
self
,
recv_req
:
ResumeMemoryOccupationReqInput
):
def
resume_memory_occupation
(
self
,
recv_req
:
ResumeMemoryOccupationReqInput
):
self
.
memory_saver_adapter
.
check_validity
(
caller_name
=
"resume_memory_occupation"
)
self
.
memory_saver_adapter
.
resume
()
self
.
memory_saver_adapter
.
resume
()
_import_static_state
(
_import_static_state
(
self
.
tp_worker
.
worker
.
model_runner
.
model
,
self
.
stashed_model_static_state
self
.
tp_worker
.
worker
.
model_runner
.
model
,
self
.
stashed_model_static_state
...
...
python/sglang/srt/model_executor/model_runner.py
View file @
26f07294
...
@@ -287,7 +287,14 @@ class ModelRunner:
...
@@ -287,7 +287,14 @@ class ModelRunner:
def
init_torch_distributed
(
self
):
def
init_torch_distributed
(
self
):
logger
.
info
(
"Init torch distributed begin."
)
logger
.
info
(
"Init torch distributed begin."
)
torch
.
get_device_module
(
self
.
device
).
set_device
(
self
.
gpu_id
)
try
:
torch
.
get_device_module
(
self
.
device
).
set_device
(
self
.
gpu_id
)
except
Exception
:
logger
.
warning
(
f
"Context:
{
self
.
device
=
}
{
self
.
gpu_id
=
}
{
os
.
environ
.
get
(
'CUDA_VISIBLE_DEVICES'
)
=
}
{
self
.
tp_rank
=
}
{
self
.
tp_size
=
}
"
)
raise
if
self
.
device
==
"cuda"
:
if
self
.
device
==
"cuda"
:
backend
=
"nccl"
backend
=
"nccl"
elif
self
.
device
==
"xpu"
:
elif
self
.
device
==
"xpu"
:
...
...
python/sglang/srt/torch_memory_saver_adapter.py
View file @
26f07294
import
logging
from
abc
import
ABC
from
abc
import
ABC
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
...
@@ -8,6 +9,8 @@ try:
...
@@ -8,6 +9,8 @@ try:
except
ImportError
:
except
ImportError
:
pass
pass
logger
=
logging
.
getLogger
(
__name__
)
class
TorchMemorySaverAdapter
(
ABC
):
class
TorchMemorySaverAdapter
(
ABC
):
@
staticmethod
@
staticmethod
...
@@ -16,6 +19,13 @@ class TorchMemorySaverAdapter(ABC):
...
@@ -16,6 +19,13 @@ class TorchMemorySaverAdapter(ABC):
_TorchMemorySaverAdapterReal
()
if
enable
else
_TorchMemorySaverAdapterNoop
()
_TorchMemorySaverAdapterReal
()
if
enable
else
_TorchMemorySaverAdapterNoop
()
)
)
def
check_validity
(
self
,
caller_name
):
if
not
self
.
enabled
:
logger
.
warning
(
f
"`
{
caller_name
}
` will not save memory because torch_memory_saver is not enabled. "
f
"Potential causes: `enable_memory_saver` is false, or torch_memory_saver has installation issues."
)
def
configure_subprocess
(
self
):
def
configure_subprocess
(
self
):
raise
NotImplementedError
raise
NotImplementedError
...
@@ -28,6 +38,10 @@ class TorchMemorySaverAdapter(ABC):
...
@@ -28,6 +38,10 @@ class TorchMemorySaverAdapter(ABC):
def
resume
(
self
):
def
resume
(
self
):
raise
NotImplementedError
raise
NotImplementedError
@
property
def
enabled
(
self
):
raise
NotImplementedError
class
_TorchMemorySaverAdapterReal
(
TorchMemorySaverAdapter
):
class
_TorchMemorySaverAdapterReal
(
TorchMemorySaverAdapter
):
def
configure_subprocess
(
self
):
def
configure_subprocess
(
self
):
...
@@ -42,6 +56,10 @@ class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
...
@@ -42,6 +56,10 @@ class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
def
resume
(
self
):
def
resume
(
self
):
return
_primary_memory_saver
.
resume
()
return
_primary_memory_saver
.
resume
()
@
property
def
enabled
(
self
):
return
_primary_memory_saver
.
enabled
class
_TorchMemorySaverAdapterNoop
(
TorchMemorySaverAdapter
):
class
_TorchMemorySaverAdapterNoop
(
TorchMemorySaverAdapter
):
@
contextmanager
@
contextmanager
...
@@ -57,3 +75,7 @@ class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
...
@@ -57,3 +75,7 @@ class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
def
resume
(
self
):
def
resume
(
self
):
pass
pass
@
property
def
enabled
(
self
):
return
False
python/sglang/test/attention/test_flashattn_backend.py
View file @
26f07294
...
@@ -6,6 +6,7 @@ from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBac
...
@@ -6,6 +6,7 @@ from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBac
from
sglang.srt.layers.radix_attention
import
RadixAttention
from
sglang.srt.layers.radix_attention
import
RadixAttention
from
sglang.srt.mem_cache.memory_pool
import
MHATokenToKVPool
from
sglang.srt.mem_cache.memory_pool
import
MHATokenToKVPool
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
ForwardMode
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
ForwardMode
from
sglang.test.test_utils
import
CustomTestCase
class
MockModelRunner
:
class
MockModelRunner
:
...
@@ -39,7 +40,7 @@ class MockReqToTokenPool:
...
@@ -39,7 +40,7 @@ class MockReqToTokenPool:
@
unittest
.
skipIf
(
not
torch
.
cuda
.
is_available
(),
"Test requires CUDA"
)
@
unittest
.
skipIf
(
not
torch
.
cuda
.
is_available
(),
"Test requires CUDA"
)
class
TestFlashAttentionBackend
(
unittest
.
TestCase
):
class
TestFlashAttentionBackend
(
Custom
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
"""Set up test fixtures before each test method."""
"""Set up test fixtures before each test method."""
self
.
model_runner
=
MockModelRunner
()
self
.
model_runner
=
MockModelRunner
()
...
...
python/sglang/test/test_utils.py
View file @
26f07294
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
argparse
import
argparse
import
asyncio
import
asyncio
import
copy
import
copy
import
logging
import
os
import
os
import
random
import
random
import
subprocess
import
subprocess
...
@@ -922,6 +923,10 @@ def run_mulit_request_test(
...
@@ -922,6 +923,10 @@ def run_mulit_request_test(
def
write_github_step_summary
(
content
):
def
write_github_step_summary
(
content
):
if
not
os
.
environ
.
get
(
"GITHUB_STEP_SUMMARY"
):
logging
.
warning
(
"GITHUB_STEP_SUMMARY environment variable not set"
)
return
with
open
(
os
.
environ
[
"GITHUB_STEP_SUMMARY"
],
"a"
)
as
f
:
with
open
(
os
.
environ
[
"GITHUB_STEP_SUMMARY"
],
"a"
)
as
f
:
f
.
write
(
content
)
f
.
write
(
content
)
...
...
test/srt/test_mla_int8_deepseek_v3.py
View file @
26f07294
...
@@ -46,7 +46,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
...
@@ -46,7 +46,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
Equal
(
metrics
[
"accuracy"
],
0.6
1
)
class
TestDeepseekV3MTPChannelInt8
(
CustomTestCase
):
class
TestDeepseekV3MTPChannelInt8
(
CustomTestCase
):
...
...
test/srt/test_vision_openai_server.py
View file @
26f07294
...
@@ -624,7 +624,6 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
...
@@ -624,7 +624,6 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
"minicpmo"
,
"minicpmo"
,
"--mem-fraction-static"
,
"--mem-fraction-static"
,
"0.7"
,
"0.7"
,
"--tp=2"
,
],
],
)
)
cls
.
base_url
+=
"/v1"
cls
.
base_url
+=
"/v1"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment