Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f976f02b
Unverified
Commit
f976f02b
authored
Jan 26, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Jan 26, 2026
Browse files
refactor: rename sleep/wake endpoints for consistency (#5629)
parent
3d8c497e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
45 additions
and
45 deletions
+45
-45
components/src/dynamo/vllm/handlers.py
components/src/dynamo/vllm/handlers.py
+2
-2
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+6
-6
lib/gpu_memory_service/client/memory_manager.py
lib/gpu_memory_service/client/memory_manager.py
+37
-37
No files found.
components/src/dynamo/vllm/handlers.py
View file @
f976f02b
...
@@ -303,7 +303,7 @@ class BaseWorkerHandler(ABC):
...
@@ -303,7 +303,7 @@ class BaseWorkerHandler(ABC):
logger
.
error
(
f
"Failed to sleep engine:
{
e
}
"
)
logger
.
error
(
f
"Failed to sleep engine:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
async
def
wake
(
self
,
body
:
dict
)
->
dict
:
async
def
wake
_up
(
self
,
body
:
dict
)
->
dict
:
"""Wake the engine to restore GPU memory and re-register to discovery.
"""Wake the engine to restore GPU memory and re-register to discovery.
Args:
Args:
...
@@ -331,7 +331,7 @@ class BaseWorkerHandler(ABC):
...
@@ -331,7 +331,7 @@ class BaseWorkerHandler(ABC):
return
{
"status"
:
"ok"
,
"message"
:
f
"Engine woke (tags=
{
tags
}
)"
}
return
{
"status"
:
"ok"
,
"message"
:
f
"Engine woke (tags=
{
tags
}
)"
}
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
error
(
f
"Failed to wake engine:
{
e
}
"
)
logger
.
error
(
f
"Failed to wake
up
engine:
{
e
}
"
)
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
return
{
"status"
:
"error"
,
"message"
:
str
(
e
)}
@
abstractmethod
@
abstractmethod
...
...
components/src/dynamo/vllm/main.py
View file @
f976f02b
...
@@ -460,10 +460,10 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
...
@@ -460,10 +460,10 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
setup_metrics_collection
(
config
,
generate_endpoint
,
logger
)
setup_metrics_collection
(
config
,
generate_endpoint
,
logger
)
# Register sleep/wake engine routes
# Register sleep/wake
_up
engine routes
runtime
.
register_engine_route
(
"sleep"
,
handler
.
sleep
)
runtime
.
register_engine_route
(
"sleep"
,
handler
.
sleep
)
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
runtime
.
register_engine_route
(
"wake
_up
"
,
handler
.
wake
_up
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake
_up
"
)
# Handle non-leader nodes - don't serve endpoints
# Handle non-leader nodes - don't serve endpoints
if
config
.
engine_args
.
data_parallel_rank
:
if
config
.
engine_args
.
data_parallel_rank
:
...
@@ -585,10 +585,10 @@ async def init(runtime: DistributedRuntime, config: Config):
...
@@ -585,10 +585,10 @@ async def init(runtime: DistributedRuntime, config: Config):
setup_metrics_collection
(
config
,
generate_endpoint
,
logger
)
setup_metrics_collection
(
config
,
generate_endpoint
,
logger
)
# Register sleep/wake engine routes
# Register sleep/wake
_up
engine routes
runtime
.
register_engine_route
(
"sleep"
,
handler
.
sleep
)
runtime
.
register_engine_route
(
"sleep"
,
handler
.
sleep
)
runtime
.
register_engine_route
(
"wake"
,
handler
.
wake
)
runtime
.
register_engine_route
(
"wake
_up
"
,
handler
.
wake
_up
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake"
)
logger
.
info
(
"Registered engine routes: /engine/sleep, /engine/wake
_up
"
)
# Handle non-leader nodes - don't serve endpoints
# Handle non-leader nodes - don't serve endpoints
if
config
.
engine_args
.
data_parallel_rank
:
if
config
.
engine_args
.
data_parallel_rank
:
...
...
lib/gpu_memory_service/client/memory_manager.py
View file @
f976f02b
...
@@ -10,7 +10,7 @@ Key properties:
...
@@ -10,7 +10,7 @@ Key properties:
- The socket connection itself is the RW/RO lock.
- The socket connection itself is the RW/RO lock.
- In write mode, the manager can allocate + map RW and then publish via commit().
- In write mode, the manager can allocate + map RW and then publish via commit().
- In read mode, the manager can import + map RO and hold the RO lock during inference.
- In read mode, the manager can import + map RO and hold the RO lock during inference.
-
sleep()/wake
() releases and reacquires the RO lock (and remaps allocations).
-
unmap()/remap
() releases and reacquires the RO lock (and remaps allocations).
This module uses cuda-python bindings for CUDA driver API calls:
This module uses cuda-python bindings for CUDA driver API calls:
- import FDs (cuMemImportFromShareableHandle)
- import FDs (cuMemImportFromShareableHandle)
...
@@ -47,20 +47,20 @@ logger = logging.getLogger(__name__)
...
@@ -47,20 +47,20 @@ logger = logging.getLogger(__name__)
class
StaleMemoryLayoutError
(
Exception
):
class
StaleMemoryLayoutError
(
Exception
):
"""Raised when memory layout was modified while
sleeping
.
"""Raised when memory layout was modified while
unmapped
.
This error indicates that a writer acquired the RW lock and changed the
This error indicates that a writer acquired the RW lock and changed the
allocation structure (different sizes, different tensor layouts) while this
allocation structure (different sizes, different tensor layouts) while this
reader was
sleeping
. The caller should re-import the model from scratch.
reader was
unmapped
. The caller should re-import the model from scratch.
IMPORTANT: This is a LAYOUT check, NOT a CONTENT check.
IMPORTANT: This is a LAYOUT check, NOT a CONTENT check.
- Detected: Allocation sizes changed, tensors added/removed, metadata structure changed
- Detected: Allocation sizes changed, tensors added/removed, metadata structure changed
- NOT detected: Weight values modified in-place
- NOT detected: Weight values modified in-place
This design is intentional:
sleep/wake
enables use cases like RL training
This design is intentional:
unmap/remap
enables use cases like RL training
where another process can write to the same memory locations (e.g., updating
where another process can write to the same memory locations (e.g., updating
weights) while preserving the structure. As long as the layout (allocation
weights) while preserving the structure. As long as the layout (allocation
and metadata table hashes) remains identical,
wake
() succeeds.
and metadata table hashes) remains identical,
remap
() succeeds.
"""
"""
pass
pass
...
@@ -106,7 +106,7 @@ class GMSClientMemoryManager:
...
@@ -106,7 +106,7 @@ class GMSClientMemoryManager:
Modes:
Modes:
- mode=RequestedLockType.RW: acquire RW lock, allocate/map RW, mutate metadata, commit/publish.
- mode=RequestedLockType.RW: acquire RW lock, allocate/map RW, mutate metadata, commit/publish.
- mode=RequestedLockType.RO: acquire RO lock (READY only), import/map RO,
sleep/wake
.
- mode=RequestedLockType.RO: acquire RO lock (READY only), import/map RO,
unmap/remap
.
- mode=RequestedLockType.RW_OR_RO: try RW if available, else wait for RO.
- mode=RequestedLockType.RW_OR_RO: try RW if available, else wait for RO.
"""
"""
...
@@ -126,13 +126,13 @@ class GMSClientMemoryManager:
...
@@ -126,13 +126,13 @@ class GMSClientMemoryManager:
self
.
_mappings
:
Dict
[
int
,
LocalMapping
]
=
{}
# va -> mapping
self
.
_mappings
:
Dict
[
int
,
LocalMapping
]
=
{}
# va -> mapping
self
.
_allocation_id_to_va
:
Dict
[
str
,
int
]
=
{}
self
.
_allocation_id_to_va
:
Dict
[
str
,
int
]
=
{}
self
.
_
sleeping
=
False
self
.
_
unmapped
=
False
self
.
_closed
=
False
self
.
_closed
=
False
self
.
_preserved_allocation_ids
:
List
[
str
]
=
[]
self
.
_preserved_allocation_ids
:
List
[
str
]
=
[]
self
.
_published
=
False
self
.
_published
=
False
self
.
_mode
:
Optional
[
GrantedLockType
]
=
None
# Updated by _connect
self
.
_mode
:
Optional
[
GrantedLockType
]
=
None
# Updated by _connect
# VA-stable
sleep/wake
state
# VA-stable
unmap/remap
state
self
.
_va_preserved
=
False
self
.
_va_preserved
=
False
self
.
_last_memory_layout_hash
:
str
=
(
self
.
_last_memory_layout_hash
:
str
=
(
""
# Hash from server, saved on connect/commit
""
# Hash from server, saved on connect/commit
...
@@ -157,10 +157,10 @@ class GMSClientMemoryManager:
...
@@ -157,10 +157,10 @@ class GMSClientMemoryManager:
self
.
_client
=
GMSRPCClient
(
self
.
_client
=
GMSRPCClient
(
self
.
socket_path
,
lock_type
=
lock_type
,
timeout_ms
=
timeout_ms
self
.
socket_path
,
lock_type
=
lock_type
,
timeout_ms
=
timeout_ms
)
)
self
.
_
sleeping
=
False
self
.
_
unmapped
=
False
# Update mode based on granted lock type (may differ from requested for rw_or_ro)
# Update mode based on granted lock type (may differ from requested for rw_or_ro)
self
.
_mode
=
self
.
_client
.
lock_type
self
.
_mode
=
self
.
_client
.
lock_type
# Save state hash for stale detection on
wake
(skip during
wake
itself)
# Save state hash for stale detection on
remap
(skip during
remap
itself)
if
update_memory_layout_hash
and
self
.
_client
.
committed
:
if
update_memory_layout_hash
and
self
.
_client
.
committed
:
self
.
_last_memory_layout_hash
=
self
.
_client
.
get_memory_layout_hash
()
self
.
_last_memory_layout_hash
=
self
.
_client
.
get_memory_layout_hash
()
...
@@ -181,8 +181,8 @@ class GMSClientMemoryManager:
...
@@ -181,8 +181,8 @@ class GMSClientMemoryManager:
return
self
.
_client
is
not
None
and
self
.
_client
.
is_connected
return
self
.
_client
is
not
None
and
self
.
_client
.
is_connected
@
property
@
property
def
is_
sleeping
(
self
)
->
bool
:
def
is_
unmapped
(
self
)
->
bool
:
return
self
.
_
sleeping
return
self
.
_
unmapped
@
property
@
property
def
mappings
(
self
)
->
Dict
[
int
,
LocalMapping
]:
def
mappings
(
self
)
->
Dict
[
int
,
LocalMapping
]:
...
@@ -366,9 +366,9 @@ class GMSClientMemoryManager:
...
@@ -366,9 +366,9 @@ class GMSClientMemoryManager:
"""
"""
if
self
.
_closed
:
if
self
.
_closed
:
raise
RuntimeError
(
"Memory manager is closed"
)
raise
RuntimeError
(
"Memory manager is closed"
)
if
self
.
_
sleeping
:
if
self
.
_
unmapped
:
raise
RuntimeError
(
raise
RuntimeError
(
"Cannot switch_to_read() while
sleeping
; call
wake
() first"
"Cannot switch_to_read() while
unmapped
; call
remap
() first"
)
)
if
self
.
_client
is
not
None
:
if
self
.
_client
is
not
None
:
if
self
.
lock_type
==
GrantedLockType
.
RO
:
if
self
.
lock_type
==
GrantedLockType
.
RO
:
...
@@ -380,25 +380,25 @@ class GMSClientMemoryManager:
...
@@ -380,25 +380,25 @@ class GMSClientMemoryManager:
eff_timeout
=
timeout_ms
if
timeout_ms
is
not
None
else
self
.
_timeout_ms
eff_timeout
=
timeout_ms
if
timeout_ms
is
not
None
else
self
.
_timeout_ms
self
.
_connect
(
lock_type
=
RequestedLockType
.
RO
,
timeout_ms
=
eff_timeout
)
self
.
_connect
(
lock_type
=
RequestedLockType
.
RO
,
timeout_ms
=
eff_timeout
)
# ====================
Sleep / wake
(read mode) ====================
# ====================
Unmap / remap
(read mode) ====================
def
slee
p
(
self
)
->
None
:
def
unma
p
(
self
)
->
None
:
"""Release RO lock and unmap local allocations (VA-stable).
"""Release RO lock and unmap local allocations (VA-stable).
VAs are preserved during
slee
p so tensor pointers remain stable.
VAs are preserved during
unma
p so tensor pointers remain stable.
On
wake
, allocations are remapped to the same VAs.
On
remap
, allocations are remapped to the same VAs.
"""
"""
if
self
.
_closed
:
if
self
.
_closed
:
raise
RuntimeError
(
"Memory manager is closed"
)
raise
RuntimeError
(
"Memory manager is closed"
)
if
self
.
_
sleeping
:
if
self
.
_
unmapped
:
return
return
if
self
.
lock_type
!=
GrantedLockType
.
RO
:
if
self
.
lock_type
!=
GrantedLockType
.
RO
:
raise
RuntimeError
(
"
slee
p() requires RO mode"
)
raise
RuntimeError
(
"
unma
p() requires RO mode"
)
if
torch
.
cuda
.
is_available
():
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
synchronize
(
self
.
device
)
torch
.
cuda
.
synchronize
(
self
.
device
)
# Preserve allocation IDs for remapping on
wake
# Preserve allocation IDs for remapping on
remap
self
.
_preserved_allocation_ids
=
list
(
self
.
_allocation_id_to_va
.
keys
())
self
.
_preserved_allocation_ids
=
list
(
self
.
_allocation_id_to_va
.
keys
())
# Unmap physical memory but keep VA reservations
# Unmap physical memory but keep VA reservations
...
@@ -407,12 +407,12 @@ class GMSClientMemoryManager:
...
@@ -407,12 +407,12 @@ class GMSClientMemoryManager:
self
.
_client_rpc
.
close
()
self
.
_client_rpc
.
close
()
self
.
_client
=
None
self
.
_client
=
None
self
.
_
sleeping
=
True
self
.
_
unmapped
=
True
def
wake
(
self
,
timeout_ms
:
Optional
[
int
]
=
None
)
->
bool
:
def
remap
(
self
,
timeout_ms
:
Optional
[
int
]
=
None
)
->
bool
:
"""Reacquire RO lock and remap preserved allocations (VA-stable).
"""Reacquire RO lock and remap preserved allocations (VA-stable).
Allocations are remapped to the same VAs they had before
slee
p,
Allocations are remapped to the same VAs they had before
unma
p,
ensuring tensor pointers remain valid.
ensuring tensor pointers remain valid.
Args:
Args:
...
@@ -423,11 +423,11 @@ class GMSClientMemoryManager:
...
@@ -423,11 +423,11 @@ class GMSClientMemoryManager:
Raises:
Raises:
TimeoutError: If timeout_ms expires waiting for RO lock.
TimeoutError: If timeout_ms expires waiting for RO lock.
StaleMemoryLayoutError: If weights were structurally changed while
sleeping
.
StaleMemoryLayoutError: If weights were structurally changed while
unmapped
.
"""
"""
if
self
.
_closed
:
if
self
.
_closed
:
raise
RuntimeError
(
"Memory manager is closed"
)
raise
RuntimeError
(
"Memory manager is closed"
)
if
not
self
.
_
sleeping
:
if
not
self
.
_
unmapped
:
return
True
return
True
if
torch
.
cuda
.
is_available
():
if
torch
.
cuda
.
is_available
():
...
@@ -440,14 +440,14 @@ class GMSClientMemoryManager:
...
@@ -440,14 +440,14 @@ class GMSClientMemoryManager:
update_memory_layout_hash
=
False
,
update_memory_layout_hash
=
False
,
)
)
# Check if memory layout changed while
sleeping
# Check if memory layout changed while
unmapped
current_hash
=
self
.
_client_rpc
.
get_memory_layout_hash
()
current_hash
=
self
.
_client_rpc
.
get_memory_layout_hash
()
if
(
if
(
self
.
_last_memory_layout_hash
self
.
_last_memory_layout_hash
and
current_hash
!=
self
.
_last_memory_layout_hash
and
current_hash
!=
self
.
_last_memory_layout_hash
):
):
raise
StaleMemoryLayoutError
(
raise
StaleMemoryLayoutError
(
f
"State changed while
sleeping
: hash
{
self
.
_last_memory_layout_hash
[:
16
]
}
... ->
{
current_hash
[:
16
]
}
..."
f
"State changed while
unmapped
: hash
{
self
.
_last_memory_layout_hash
[:
16
]
}
... ->
{
current_hash
[:
16
]
}
..."
)
)
# Remap to preserved VAs
# Remap to preserved VAs
...
@@ -469,16 +469,16 @@ class GMSClientMemoryManager:
...
@@ -469,16 +469,16 @@ class GMSClientMemoryManager:
if
failed_count
>
0
:
if
failed_count
>
0
:
raise
RuntimeError
(
raise
RuntimeError
(
f
"
Wake
failed:
{
failed_count
}
of
{
len
(
self
.
_preserved_allocation_ids
)
}
"
f
"
Remap
failed:
{
failed_count
}
of
{
len
(
self
.
_preserved_allocation_ids
)
}
"
f
"allocations could not be remapped"
f
"allocations could not be remapped"
)
)
logger
.
info
(
logger
.
info
(
f
"[GPU Memory Service]
Wake
complete on device
{
self
.
device
}
: "
f
"[GPU Memory Service]
Remap
complete on device
{
self
.
device
}
: "
f
"remapped
{
remapped_count
}
allocations (
{
total_bytes
/
(
1
<<
30
):.
2
f
}
GiB)"
f
"remapped
{
remapped_count
}
allocations (
{
total_bytes
/
(
1
<<
30
):.
2
f
}
GiB)"
)
)
self
.
_
sleeping
=
False
self
.
_
unmapped
=
False
self
.
_va_preserved
=
False
self
.
_va_preserved
=
False
return
True
return
True
...
@@ -499,7 +499,7 @@ class GMSClientMemoryManager:
...
@@ -499,7 +499,7 @@ class GMSClientMemoryManager:
self
.
_client
.
close
()
self
.
_client
.
close
()
self
.
_client
=
None
self
.
_client
=
None
self
.
_closed
=
True
self
.
_closed
=
True
self
.
_
sleeping
=
False
self
.
_
unmapped
=
False
self
.
_va_preserved
=
False
self
.
_va_preserved
=
False
self
.
_preserved_allocation_ids
.
clear
()
self
.
_preserved_allocation_ids
.
clear
()
...
@@ -515,8 +515,8 @@ class GMSClientMemoryManager:
...
@@ -515,8 +515,8 @@ class GMSClientMemoryManager:
def
_client_rpc
(
self
)
->
GMSRPCClient
:
def
_client_rpc
(
self
)
->
GMSRPCClient
:
"""Get connected client or raise. Use instead of _require_connected() + assert."""
"""Get connected client or raise. Use instead of _require_connected() + assert."""
if
self
.
_client
is
None
:
if
self
.
_client
is
None
:
if
self
.
_
sleeping
:
if
self
.
_
unmapped
:
raise
RuntimeError
(
"Memory manager is
sleeping
"
)
raise
RuntimeError
(
"Memory manager is
unmapped
"
)
raise
RuntimeError
(
"Memory manager is not connected"
)
raise
RuntimeError
(
"Memory manager is not connected"
)
return
self
.
_client
return
self
.
_client
...
@@ -530,10 +530,10 @@ class GMSClientMemoryManager:
...
@@ -530,10 +530,10 @@ class GMSClientMemoryManager:
self
.
_allocation_id_to_va
[
m
.
allocation_id
]
=
m
.
va
self
.
_allocation_id_to_va
[
m
.
allocation_id
]
=
m
.
va
def
_unmap_preserving_va
(
self
)
->
None
:
def
_unmap_preserving_va
(
self
)
->
None
:
"""Unmap physical memory but PRESERVE VA reservations for
sleep/wake
.
"""Unmap physical memory but PRESERVE VA reservations for
unmap/remap
.
This keeps the VA reservation intact so tensors maintain stable pointers.
This keeps the VA reservation intact so tensors maintain stable pointers.
On
wake
, we can remap to the same VAs.
On
remap
, we can remap to the same VAs.
"""
"""
unmapped_count
=
0
unmapped_count
=
0
total_bytes
=
0
total_bytes
=
0
...
@@ -560,7 +560,7 @@ class GMSClientMemoryManager:
...
@@ -560,7 +560,7 @@ class GMSClientMemoryManager:
def
_remap_preserved_va
(
self
,
allocation_id
:
str
)
->
int
:
def
_remap_preserved_va
(
self
,
allocation_id
:
str
)
->
int
:
"""Remap an allocation to its preserved VA.
"""Remap an allocation to its preserved VA.
Requires the VA to already be reserved (from before
slee
p).
Requires the VA to already be reserved (from before
unma
p).
Validates allocation still exists and size matches.
Validates allocation still exists and size matches.
Returns the VA.
Returns the VA.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment