Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2e8c4447
"lib/bindings/vscode:/vscode.git/clone" did not exist on "3c500ae7a9e8b8bc9dae8b558342eec79dc86106"
Unverified
Commit
2e8c4447
authored
Jan 20, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Jan 20, 2026
Browse files
refactor: clean up SGLang sleep/wake implementation (#5517)
parent
04cecda7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
41 additions
and
74 deletions
+41
-74
components/src/dynamo/sglang/main.py
components/src/dynamo/sglang/main.py
+2
-56
components/src/dynamo/sglang/request_handlers/handler_base.py
...onents/src/dynamo/sglang/request_handlers/handler_base.py
+39
-18
No files found.
components/src/dynamo/sglang/main.py
View file @
2e8c4447
...
@@ -124,23 +124,6 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -124,23 +124,6 @@ async def init(runtime: DistributedRuntime, config: Config):
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
return
return
# Register engine routes for profiling
async
def
start_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/start_profile requests"""
await
engine
.
tokenizer_manager
.
start_profile
(
**
body
)
return
{
"status"
:
"ok"
,
"message"
:
"Profiling started"
}
async
def
stop_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/stop_profile requests"""
await
engine
.
tokenizer_manager
.
stop_profile
()
return
{
"status"
:
"ok"
,
"message"
:
"Profiling stopped"
}
runtime
.
register_engine_route
(
"start_profile"
,
start_profile_handler
)
runtime
.
register_engine_route
(
"stop_profile"
,
stop_profile_handler
)
logging
.
info
(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)
# publisher instantiates the metrics and kv event publishers
# publisher instantiates the metrics and kv event publishers
publisher
,
metrics_task
,
metrics_labels
=
await
setup_sgl_metrics
(
publisher
,
metrics_task
,
metrics_labels
=
await
setup_sgl_metrics
(
engine
,
config
,
component
,
generate_endpoint
engine
,
config
,
component
,
generate_endpoint
...
@@ -156,17 +139,7 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -156,17 +139,7 @@ async def init(runtime: DistributedRuntime, config: Config):
handler
=
DecodeWorkerHandler
(
handler
=
DecodeWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
)
handler
.
register_engine_routes
(
runtime
)
# Register memory management routes using handler methods
runtime
.
register_engine_route
(
"release_memory_occupation"
,
handler
.
release_memory_occupation
)
runtime
.
register_engine_route
(
"resume_memory_occupation"
,
handler
.
resume_memory_occupation
)
logging
.
info
(
"Registered engine routes: /engine/release_memory_occupation, /engine/resume_memory_occupation"
)
print
(
f
"Config:
{
config
}
"
)
print
(
f
"Config:
{
config
}
"
)
health_check_payload
=
SglangHealthCheckPayload
(
health_check_payload
=
SglangHealthCheckPayload
(
...
@@ -238,23 +211,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -238,23 +211,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
return
return
# Register engine routes for profiling
async
def
start_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/start_profile requests"""
await
engine
.
tokenizer_manager
.
start_profile
(
**
body
)
return
{
"status"
:
"ok"
,
"message"
:
"Profiling started"
}
async
def
stop_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/stop_profile requests"""
await
engine
.
tokenizer_manager
.
stop_profile
()
return
{
"status"
:
"ok"
,
"message"
:
"Profiling stopped"
}
runtime
.
register_engine_route
(
"start_profile"
,
start_profile_handler
)
runtime
.
register_engine_route
(
"stop_profile"
,
stop_profile_handler
)
logging
.
info
(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)
# Perform dummy warmup for prefill worker to avoid initial TTFT hit
# Perform dummy warmup for prefill worker to avoid initial TTFT hit
# Only needed on leader node that handles requests
# Only needed on leader node that handles requests
await
_warmup_prefill_engine
(
engine
,
server_args
)
await
_warmup_prefill_engine
(
engine
,
server_args
)
...
@@ -271,17 +227,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -271,17 +227,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
handler
=
PrefillWorkerHandler
(
handler
=
PrefillWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
)
handler
.
register_engine_routes
(
runtime
)
# Register memory management routes using handler methods
runtime
.
register_engine_route
(
"release_memory_occupation"
,
handler
.
release_memory_occupation
)
runtime
.
register_engine_route
(
"resume_memory_occupation"
,
handler
.
resume_memory_occupation
)
logging
.
info
(
"Registered engine routes: /engine/release_memory_occupation, /engine/resume_memory_occupation"
)
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
...
...
components/src/dynamo/sglang/request_handlers/handler_base.py
View file @
2e8c4447
...
@@ -17,8 +17,6 @@ from dynamo.common.utils.input_params import InputParamManager
...
@@ -17,8 +17,6 @@ from dynamo.common.utils.input_params import InputParamManager
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
logger
=
logging
.
getLogger
(
__name__
)
class
BaseWorkerHandler
(
ABC
):
class
BaseWorkerHandler
(
ABC
):
"""Abstract base class for SGLang worker handlers."""
"""Abstract base class for SGLang worker handlers."""
...
@@ -80,28 +78,23 @@ class BaseWorkerHandler(ABC):
...
@@ -80,28 +78,23 @@ class BaseWorkerHandler(ABC):
# Step 1: Unregister endpoint from discovery FIRST
# Step 1: Unregister endpoint from discovery FIRST
try
:
try
:
await
self
.
generate_endpoint
.
unregister_endpoint_instance
()
await
self
.
generate_endpoint
.
unregister_endpoint_instance
()
logger
.
info
(
"[ReleaseMemory] Unregistered endpoint from discovery - worker removed from routing pool"
)
except
Exception
as
unreg_err
:
except
Exception
as
unreg_err
:
logg
er
.
warning
(
logg
ing
.
warning
(
f
"
[ReleaseMemory]
Failed to unregister endpoint from discovery:
{
unreg_err
}
"
f
"Failed to unregister endpoint from discovery:
{
unreg_err
}
"
)
)
# Step 2: Pause generation to drain in-flight requests
# Step 2: Pause generation to drain in-flight requests
await
self
.
engine
.
async_pause_generation
()
await
self
.
engine
.
async_pause_generation
()
logger
.
info
(
"[ReleaseMemory] Generation paused"
)
# Step 3: Release memory now that it's safe
# Step 3: Release memory now that it's safe
await
self
.
engine
.
async_release_memory_occupation
(
tags
)
await
self
.
engine
.
async_release_memory_occupation
(
tags
)
logger
.
info
(
f
"[ReleaseMemory] Released memory for tags:
{
tags
}
"
)
return
{
return
{
"status"
:
"ok"
,
"status"
:
"ok"
,
"message"
:
f
"Memory released for tags:
{
tags
}
"
,
"message"
:
f
"Memory released for tags:
{
tags
}
"
,
}
}
except
Exception
as
e
:
except
Exception
as
e
:
logg
er
.
error
(
f
"Failed to release memory occupation:
{
e
}
"
)
logg
ing
.
error
(
f
"Failed to release memory occupation:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
async
def
resume_memory_occupation
(
self
,
body
:
dict
)
->
dict
:
async
def
resume_memory_occupation
(
self
,
body
:
dict
)
->
dict
:
...
@@ -123,21 +116,16 @@ class BaseWorkerHandler(ABC):
...
@@ -123,21 +116,16 @@ class BaseWorkerHandler(ABC):
try
:
try
:
# Step 1: Resume memory first - must be ready before accepting requests
# Step 1: Resume memory first - must be ready before accepting requests
await
self
.
engine
.
async_resume_memory_occupation
(
tags
)
await
self
.
engine
.
async_resume_memory_occupation
(
tags
)
logger
.
info
(
f
"[ResumeMemory] Resumed memory for tags:
{
tags
}
"
)
# Step 2: Continue generation
# Step 2: Continue generation
await
self
.
engine
.
async_continue_generation
()
await
self
.
engine
.
async_continue_generation
()
logger
.
info
(
"[ResumeMemory] Generation continued"
)
# Step 3: Re-register to discovery so frontend can route to us
# Step 3: Re-register to discovery so frontend can route to us
try
:
try
:
await
self
.
generate_endpoint
.
register_endpoint_instance
()
await
self
.
generate_endpoint
.
register_endpoint_instance
()
logger
.
info
(
"[ResumeMemory] Re-registered endpoint to discovery - worker added back to routing pool"
)
except
Exception
as
reg_err
:
except
Exception
as
reg_err
:
logg
er
.
warning
(
logg
ing
.
warning
(
f
"
[ResumeMemory]
Failed to re-register endpoint to discovery:
{
reg_err
}
"
f
"Failed to re-register endpoint to discovery:
{
reg_err
}
"
)
)
return
{
return
{
...
@@ -145,9 +133,42 @@ class BaseWorkerHandler(ABC):
...
@@ -145,9 +133,42 @@ class BaseWorkerHandler(ABC):
"message"
:
f
"Memory resumed for tags:
{
tags
}
"
,
"message"
:
f
"Memory resumed for tags:
{
tags
}
"
,
}
}
except
Exception
as
e
:
except
Exception
as
e
:
logg
er
.
error
(
f
"Failed to resume memory occupation:
{
e
}
"
)
logg
ing
.
error
(
f
"Failed to resume memory occupation:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
async def start_profile(self, body: dict) -> dict:
    """Start profiling on the engine.

    Args:
        body: Dict with profiling parameters; its items are forwarded as
            keyword arguments to ``tokenizer_manager.start_profile``.
            A missing/empty body is treated as "no parameters".

    Returns:
        ``{"status": "ok", "message": "Profiling started"}`` on success, or
        ``{"status": "error", "message": <exception text>}`` if the engine
        call raises, mirroring the memory-occupation handlers.
    """
    try:
        # `body or {}` keeps `**` from raising TypeError when no body is sent.
        await self.engine.tokenizer_manager.start_profile(**(body or {}))
    except Exception as e:
        # Report the failure in the response payload (same shape as the
        # release/resume handlers) instead of surfacing a raw exception.
        logging.error(f"Failed to start profiling: {e}")
        return {"status": "error", "message": str(e)}
    return {"status": "ok", "message": "Profiling started"}
async def stop_profile(self, body: dict) -> dict:
    """Stop profiling on the engine.

    Args:
        body: Unused, but required for the engine-route handler signature.

    Returns:
        ``{"status": "ok", "message": "Profiling stopped"}`` on success, or
        ``{"status": "error", "message": <exception text>}`` if the engine
        call raises, mirroring the memory-occupation handlers.
    """
    try:
        await self.engine.tokenizer_manager.stop_profile()
    except Exception as e:
        # Report the failure in the response payload (same shape as the
        # release/resume handlers) instead of surfacing a raw exception.
        logging.error(f"Failed to stop profiling: {e}")
        return {"status": "error", "message": str(e)}
    return {"status": "ok", "message": "Profiling stopped"}
def register_engine_routes(self, runtime) -> None:
    """Register all engine routes for this handler.

    Each route name is bound to the handler method of the same name via
    ``runtime.register_engine_route(name, callback)``.

    Args:
        runtime: The DistributedRuntime instance to register routes on.
    """
    # Single source of truth for the exposed routes; insertion order is the
    # registration order.
    routes = {
        "start_profile": self.start_profile,
        "stop_profile": self.stop_profile,
        "release_memory_occupation": self.release_memory_occupation,
        "resume_memory_occupation": self.resume_memory_occupation,
    }
    for route_name, callback in routes.items():
        runtime.register_engine_route(route_name, callback)

    # Keep the startup log of what was exposed, as the per-worker
    # registration code logged before it was centralised here.
    logging.info(
        "Registered engine routes: %s",
        ", ".join(f"/engine/{route_name}" for route_name in routes),
    )
@
abstractmethod
@
abstractmethod
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
],
context
:
Context
):
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
],
context
:
Context
):
"""Generate response from request.
"""Generate response from request.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment