Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2e8c4447
Unverified
Commit
2e8c4447
authored
Jan 20, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Jan 20, 2026
Browse files
refactor: clean up SGLang sleep/wake implementation (#5517)
parent
04cecda7
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
41 additions
and
74 deletions
+41
-74
components/src/dynamo/sglang/main.py
components/src/dynamo/sglang/main.py
+2
-56
components/src/dynamo/sglang/request_handlers/handler_base.py
...onents/src/dynamo/sglang/request_handlers/handler_base.py
+39
-18
No files found.
components/src/dynamo/sglang/main.py
View file @
2e8c4447
...
@@ -124,23 +124,6 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -124,23 +124,6 @@ async def init(runtime: DistributedRuntime, config: Config):
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
return
return
# Register engine routes for profiling
async
def
start_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/start_profile requests"""
await
engine
.
tokenizer_manager
.
start_profile
(
**
body
)
return
{
"status"
:
"ok"
,
"message"
:
"Profiling started"
}
async
def
stop_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/stop_profile requests"""
await
engine
.
tokenizer_manager
.
stop_profile
()
return
{
"status"
:
"ok"
,
"message"
:
"Profiling stopped"
}
runtime
.
register_engine_route
(
"start_profile"
,
start_profile_handler
)
runtime
.
register_engine_route
(
"stop_profile"
,
stop_profile_handler
)
logging
.
info
(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)
# publisher instantiates the metrics and kv event publishers
# publisher instantiates the metrics and kv event publishers
publisher
,
metrics_task
,
metrics_labels
=
await
setup_sgl_metrics
(
publisher
,
metrics_task
,
metrics_labels
=
await
setup_sgl_metrics
(
engine
,
config
,
component
,
generate_endpoint
engine
,
config
,
component
,
generate_endpoint
...
@@ -156,17 +139,7 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -156,17 +139,7 @@ async def init(runtime: DistributedRuntime, config: Config):
handler
=
DecodeWorkerHandler
(
handler
=
DecodeWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
)
handler
.
register_engine_routes
(
runtime
)
# Register memory management routes using handler methods
runtime
.
register_engine_route
(
"release_memory_occupation"
,
handler
.
release_memory_occupation
)
runtime
.
register_engine_route
(
"resume_memory_occupation"
,
handler
.
resume_memory_occupation
)
logging
.
info
(
"Registered engine routes: /engine/release_memory_occupation, /engine/resume_memory_occupation"
)
print
(
f
"Config:
{
config
}
"
)
print
(
f
"Config:
{
config
}
"
)
health_check_payload
=
SglangHealthCheckPayload
(
health_check_payload
=
SglangHealthCheckPayload
(
...
@@ -238,23 +211,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -238,23 +211,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
await
_handle_non_leader_node
(
engine
,
generate_endpoint
)
return
return
# Register engine routes for profiling
async
def
start_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/start_profile requests"""
await
engine
.
tokenizer_manager
.
start_profile
(
**
body
)
return
{
"status"
:
"ok"
,
"message"
:
"Profiling started"
}
async
def
stop_profile_handler
(
body
:
dict
)
->
dict
:
"""Handle /engine/stop_profile requests"""
await
engine
.
tokenizer_manager
.
stop_profile
()
return
{
"status"
:
"ok"
,
"message"
:
"Profiling stopped"
}
runtime
.
register_engine_route
(
"start_profile"
,
start_profile_handler
)
runtime
.
register_engine_route
(
"stop_profile"
,
stop_profile_handler
)
logging
.
info
(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)
# Perform dummy warmup for prefill worker to avoid initial TTFT hit
# Perform dummy warmup for prefill worker to avoid initial TTFT hit
# Only needed on leader node that handles requests
# Only needed on leader node that handles requests
await
_warmup_prefill_engine
(
engine
,
server_args
)
await
_warmup_prefill_engine
(
engine
,
server_args
)
...
@@ -271,17 +227,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -271,17 +227,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
handler
=
PrefillWorkerHandler
(
handler
=
PrefillWorkerHandler
(
component
,
engine
,
config
,
publisher
,
generate_endpoint
component
,
engine
,
config
,
publisher
,
generate_endpoint
)
)
handler
.
register_engine_routes
(
runtime
)
# Register memory management routes using handler methods
runtime
.
register_engine_route
(
"release_memory_occupation"
,
handler
.
release_memory_occupation
)
runtime
.
register_engine_route
(
"resume_memory_occupation"
,
handler
.
resume_memory_occupation
)
logging
.
info
(
"Registered engine routes: /engine/release_memory_occupation, /engine/resume_memory_occupation"
)
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
health_check_payload
=
SglangPrefillHealthCheckPayload
(
engine
).
to_dict
()
...
...
components/src/dynamo/sglang/request_handlers/handler_base.py
View file @
2e8c4447
...
@@ -17,8 +17,6 @@ from dynamo.common.utils.input_params import InputParamManager
...
@@ -17,8 +17,6 @@ from dynamo.common.utils.input_params import InputParamManager
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.args
import
Config
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
from
dynamo.sglang.publisher
import
DynamoSglangPublisher
logger
=
logging
.
getLogger
(
__name__
)
class
BaseWorkerHandler
(
ABC
):
class
BaseWorkerHandler
(
ABC
):
"""Abstract base class for SGLang worker handlers."""
"""Abstract base class for SGLang worker handlers."""
...
@@ -80,28 +78,23 @@ class BaseWorkerHandler(ABC):
...
@@ -80,28 +78,23 @@ class BaseWorkerHandler(ABC):
# Step 1: Unregister endpoint from discovery FIRST
# Step 1: Unregister endpoint from discovery FIRST
try
:
try
:
await
self
.
generate_endpoint
.
unregister_endpoint_instance
()
await
self
.
generate_endpoint
.
unregister_endpoint_instance
()
logger
.
info
(
"[ReleaseMemory] Unregistered endpoint from discovery - worker removed from routing pool"
)
except
Exception
as
unreg_err
:
except
Exception
as
unreg_err
:
logg
er
.
warning
(
logg
ing
.
warning
(
f
"
[ReleaseMemory]
Failed to unregister endpoint from discovery:
{
unreg_err
}
"
f
"Failed to unregister endpoint from discovery:
{
unreg_err
}
"
)
)
# Step 2: Pause generation to drain in-flight requests
# Step 2: Pause generation to drain in-flight requests
await
self
.
engine
.
async_pause_generation
()
await
self
.
engine
.
async_pause_generation
()
logger
.
info
(
"[ReleaseMemory] Generation paused"
)
# Step 3: Release memory now that it's safe
# Step 3: Release memory now that it's safe
await
self
.
engine
.
async_release_memory_occupation
(
tags
)
await
self
.
engine
.
async_release_memory_occupation
(
tags
)
logger
.
info
(
f
"[ReleaseMemory] Released memory for tags:
{
tags
}
"
)
return
{
return
{
"status"
:
"ok"
,
"status"
:
"ok"
,
"message"
:
f
"Memory released for tags:
{
tags
}
"
,
"message"
:
f
"Memory released for tags:
{
tags
}
"
,
}
}
except
Exception
as
e
:
except
Exception
as
e
:
logg
er
.
error
(
f
"Failed to release memory occupation:
{
e
}
"
)
logg
ing
.
error
(
f
"Failed to release memory occupation:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
async
def
resume_memory_occupation
(
self
,
body
:
dict
)
->
dict
:
async
def
resume_memory_occupation
(
self
,
body
:
dict
)
->
dict
:
...
@@ -123,21 +116,16 @@ class BaseWorkerHandler(ABC):
...
@@ -123,21 +116,16 @@ class BaseWorkerHandler(ABC):
try
:
try
:
# Step 1: Resume memory first - must be ready before accepting requests
# Step 1: Resume memory first - must be ready before accepting requests
await
self
.
engine
.
async_resume_memory_occupation
(
tags
)
await
self
.
engine
.
async_resume_memory_occupation
(
tags
)
logger
.
info
(
f
"[ResumeMemory] Resumed memory for tags:
{
tags
}
"
)
# Step 2: Continue generation
# Step 2: Continue generation
await
self
.
engine
.
async_continue_generation
()
await
self
.
engine
.
async_continue_generation
()
logger
.
info
(
"[ResumeMemory] Generation continued"
)
# Step 3: Re-register to discovery so frontend can route to us
# Step 3: Re-register to discovery so frontend can route to us
try
:
try
:
await
self
.
generate_endpoint
.
register_endpoint_instance
()
await
self
.
generate_endpoint
.
register_endpoint_instance
()
logger
.
info
(
"[ResumeMemory] Re-registered endpoint to discovery - worker added back to routing pool"
)
except
Exception
as
reg_err
:
except
Exception
as
reg_err
:
logg
er
.
warning
(
logg
ing
.
warning
(
f
"
[ResumeMemory]
Failed to re-register endpoint to discovery:
{
reg_err
}
"
f
"Failed to re-register endpoint to discovery:
{
reg_err
}
"
)
)
return
{
return
{
...
@@ -145,9 +133,42 @@ class BaseWorkerHandler(ABC):
...
@@ -145,9 +133,42 @@ class BaseWorkerHandler(ABC):
"message"
:
f
"Memory resumed for tags:
{
tags
}
"
,
"message"
:
f
"Memory resumed for tags:
{
tags
}
"
,
}
}
except
Exception
as
e
:
except
Exception
as
e
:
logg
er
.
error
(
f
"Failed to resume memory occupation:
{
e
}
"
)
logg
ing
.
error
(
f
"Failed to resume memory occupation:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
async
def
start_profile
(
self
,
body
:
dict
)
->
dict
:
"""Start profiling on the engine.
Args:
body: Dict with profiling parameters passed to start_profile.
"""
await
self
.
engine
.
tokenizer_manager
.
start_profile
(
**
body
)
return
{
"status"
:
"ok"
,
"message"
:
"Profiling started"
}
async
def
stop_profile
(
self
,
body
:
dict
)
->
dict
:
"""Stop profiling on the engine.
Args:
body: Unused, but required for handler signature.
"""
await
self
.
engine
.
tokenizer_manager
.
stop_profile
()
return
{
"status"
:
"ok"
,
"message"
:
"Profiling stopped"
}
def
register_engine_routes
(
self
,
runtime
)
->
None
:
"""Register all engine routes for this handler.
Args:
runtime: The DistributedRuntime instance to register routes on.
"""
runtime
.
register_engine_route
(
"start_profile"
,
self
.
start_profile
)
runtime
.
register_engine_route
(
"stop_profile"
,
self
.
stop_profile
)
runtime
.
register_engine_route
(
"release_memory_occupation"
,
self
.
release_memory_occupation
)
runtime
.
register_engine_route
(
"resume_memory_occupation"
,
self
.
resume_memory_occupation
)
@
abstractmethod
@
abstractmethod
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
],
context
:
Context
):
async
def
generate
(
self
,
request
:
Dict
[
str
,
Any
],
context
:
Context
):
"""Generate response from request.
"""Generate response from request.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment