Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
bd91dca6
"lib/llm/src/vscode:/vscode.git/clone" did not exist on "95383fd61baae7660f322752e3da14168dce2af4"
Unverified
Commit
bd91dca6
authored
Jul 07, 2025
by
ishandhanani
Committed by
GitHub
Jul 07, 2025
Browse files
feat: add flush_cache endpoint to sglang (#1769)
parent
b2044566
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
183 additions
and
3 deletions
+183
-3
examples/sglang/components/decode_worker.py
examples/sglang/components/decode_worker.py
+15
-2
examples/sglang/components/worker.py
examples/sglang/components/worker.py
+14
-1
examples/sglang/utils/sgl_http_server.py
examples/sglang/utils/sgl_http_server.py
+154
-0
No files found.
examples/sglang/components/decode_worker.py
View file @
bd91dca6
...
@@ -54,6 +54,14 @@ class DecodeRequestHandler:
...
@@ -54,6 +54,14 @@ class DecodeRequestHandler:
async
for
result
in
results
:
async
for
result
in
results
:
yield
result
yield
result
async def flush_cache(self, request: dict):
    """Kick off a cache flush on the engine and acknowledge immediately.

    The flush runs as a background task; this generator does not wait for
    it to finish and yields a single acknowledgement payload.

    Args:
        request: Incoming request payload. It is not inspected.

    Yields:
        A dict with ``status`` and ``message`` keys.
    """
    _ = request
    flush_task = asyncio.create_task(self.engine.tokenizer_manager.flush_cache())
    # The event loop only keeps weak references to tasks, so a task nobody
    # holds on to can be garbage-collected before it finishes. Keep a strong
    # reference until the flush completes, then drop it.
    pending = getattr(self, "_flush_tasks", None)
    if pending is None:
        pending = self._flush_tasks = set()
    pending.add(flush_task)
    flush_task.add_done_callback(pending.discard)
    yield {
        "status": "success",
        "message": "Cache flush initiated. Check backend logs for status",
    }
async
def
graceful_shutdown
(
runtime
):
async
def
graceful_shutdown
(
runtime
):
logging
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
logging
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
...
@@ -89,8 +97,13 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
...
@@ -89,8 +97,13 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
component
=
runtime
.
namespace
(
"dynamo"
).
component
(
"decode"
)
component
=
runtime
.
namespace
(
"dynamo"
).
component
(
"decode"
)
await
component
.
create_service
()
await
component
.
create_service
()
endpoint
=
component
.
endpoint
(
"generate"
)
gen_endpoint
=
component
.
endpoint
(
"generate"
)
await
endpoint
.
serve_endpoint
(
handler
.
generate
)
flush_endpoint
=
component
.
endpoint
(
"flush_cache"
)
tasks
=
[
gen_endpoint
.
serve_endpoint
(
handler
.
generate
)]
tasks
.
append
(
flush_endpoint
.
serve_endpoint
(
handler
.
flush_cache
))
await
asyncio
.
gather
(
*
tasks
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
examples/sglang/components/worker.py
View file @
bd91dca6
...
@@ -242,6 +242,14 @@ class RequestHandler:
...
@@ -242,6 +242,14 @@ class RequestHandler:
async
for
_
in
prefill
:
async
for
_
in
prefill
:
pass
pass
async def flush_cache(self, request: dict):
    """Kick off a cache flush on the engine and acknowledge immediately.

    The flush runs as a background task; this generator does not wait for
    it to finish and yields a single acknowledgement payload.

    Args:
        request: Incoming request payload. It is not inspected.

    Yields:
        A dict with ``status`` and ``message`` keys.
    """
    _ = request
    flush_task = asyncio.create_task(self.engine.tokenizer_manager.flush_cache())
    # The event loop only keeps weak references to tasks, so a task nobody
    # holds on to can be garbage-collected before it finishes. Keep a strong
    # reference until the flush completes, then drop it.
    pending = getattr(self, "_flush_tasks", None)
    if pending is None:
        pending = self._flush_tasks = set()
    pending.add(flush_task)
    flush_task.add_done_callback(pending.discard)
    yield {
        "status": "success",
        "message": "Cache flush initiated. Check backend logs for status",
    }
async
def
graceful_shutdown
(
runtime
):
async
def
graceful_shutdown
(
runtime
):
logging
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
logging
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
...
@@ -305,7 +313,12 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
...
@@ -305,7 +313,12 @@ async def init(runtime: DistributedRuntime, server_args: ServerArgs):
)
)
_
=
ZmqKvEventPublisher
(
component
=
component
,
config
=
zmq_config
)
_
=
ZmqKvEventPublisher
(
component
=
component
,
config
=
zmq_config
)
await
endpoint
.
serve_endpoint
(
handler
.
generate
)
tasks
=
[
endpoint
.
serve_endpoint
(
handler
.
generate
)]
flush_endpoint
=
component
.
endpoint
(
"flush_cache"
)
tasks
.
append
(
flush_endpoint
.
serve_endpoint
(
handler
.
flush_cache
))
await
asyncio
.
gather
(
*
tasks
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
examples/sglang/utils/sgl_http_server.py
0 → 100644
View file @
bd91dca6
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
asyncio
import
logging
import
uvicorn
import
uvloop
from
fastapi
import
FastAPI
from
dynamo.runtime
import
DistributedRuntime
,
dynamo_worker
from
dynamo.runtime.logging
import
configure_dynamo_logging
# Endpoint name used as the default value of the --endpoint CLI option.
FLUSH_CACHE_ENDPOINT
=
"flush_cache"
# Logging is configured once, at import time, before any server object exists.
configure_dynamo_logging
()
class
SglangHttpServer
:
def
__init__
(
self
,
port
:
int
,
runtime
:
DistributedRuntime
,
args
):
self
.
port
=
port
self
.
app
=
FastAPI
()
self
.
runtime
=
runtime
self
.
args
=
args
self
.
setup_routes
()
async
def
_discover_endpoints
(
self
):
"""Discover endpoints that match the pattern"""
etcd_client
=
self
.
runtime
.
etcd_client
()
if
etcd_client
is
None
:
raise
RuntimeError
(
"Runtime has no etcd client; cannot discover endpoints"
)
prefix
=
"instances/"
kvs
=
await
etcd_client
.
kv_get_prefix
(
prefix
)
# Collect (namespace, component) combos that expose flush_cache
discovered
=
set
()
for
kv
in
kvs
:
key
=
kv
[
"key"
]
if
isinstance
(
kv
,
dict
)
else
kv
.
key
if
isinstance
(
key
,
bytes
):
key
=
key
.
decode
()
if
not
key
.
startswith
(
prefix
):
continue
segments
=
key
.
split
(
"/"
)
# Format: instances/<ns>/<comp>/<endpoint:lease>
if
len
(
segments
)
<
4
:
continue
ns
,
comp
,
ep_with_lease
=
segments
[
1
],
segments
[
2
],
segments
[
3
]
if
self
.
args
.
ns
and
ns
!=
self
.
args
.
ns
:
continue
if
self
.
args
.
comp
and
comp
!=
self
.
args
.
comp
:
continue
ep_name
=
ep_with_lease
.
split
(
":"
,
1
)[
0
]
if
ep_name
==
self
.
args
.
endpoint
:
discovered
.
add
((
ns
,
comp
))
logging
.
debug
(
f
"Discovered endpoint:
{
ns
}
.
{
comp
}
"
)
logging
.
debug
(
f
"Endpoint discovery complete. Found
{
len
(
discovered
)
}
matching endpoints"
)
return
discovered
def
setup_routes
(
self
):
@
self
.
app
.
post
(
"/flush_cache"
)
async
def
flush_cache
():
"""Flush the radix cache."""
try
:
discovered
=
await
self
.
_discover_endpoints
()
if
not
discovered
:
return
{
"message"
:
"No matching endpoints found"
,
"success"
:
False
}
logging
.
debug
(
f
"Found components:
{
', '
.
join
([
f
'
{
ns
}
.
{
comp
}
' for ns, comp in discovered])
}
"
)
for
ns
,
comp
in
discovered
:
ep
=
(
self
.
runtime
.
namespace
(
ns
)
.
component
(
comp
)
.
endpoint
(
self
.
args
.
endpoint
)
)
client
=
await
ep
.
client
()
await
client
.
wait_for_instances
()
ids
=
client
.
instance_ids
()
logging
.
debug
(
f
"--
{
ns
}
.
{
comp
}
:
{
len
(
ids
)
}
instances --"
)
for
inst_id
in
ids
:
try
:
stream
=
await
client
.
direct
(
"{}"
,
inst_id
)
async
for
payload
in
stream
:
logging
.
debug
(
f
"[
{
ns
}
.
{
comp
}
][
{
inst_id
}
] ->
{
payload
}
"
)
except
Exception
as
e
:
logging
.
error
(
f
"[
{
ns
}
.
{
comp
}
][
{
inst_id
}
] flush error:
{
e
}
"
)
return
{
"message"
:
"Cache flush initiated"
,
"success"
:
True
}
except
Exception
as
e
:
logging
.
error
(
f
"Cache flush error:
{
e
}
"
)
return
{
"message"
:
f
"Cache flush failed:
{
str
(
e
)
}
"
,
"success"
:
False
}
async
def
start_server
(
self
):
"""Start the HTTP server"""
config
=
uvicorn
.
Config
(
self
.
app
,
host
=
"0.0.0.0"
,
port
=
self
.
port
,
)
server
=
uvicorn
.
Server
(
config
)
# Single nice log with available endpoints
logging
.
info
(
f
"🚀 SGL engine HTTP server running on http://0.0.0.0:
{
self
.
port
}
- Endpoints: POST /flush_cache"
)
await
server
.
serve
()
def parse_args(argv=None):
    """Parse the command-line options of the cache-management HTTP server.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read the process command line, as before.

    Returns:
        The parsed namespace with ``port``, ``ns``, ``comp`` and ``endpoint``.
    """
    p = argparse.ArgumentParser(description="SGLang HTTP server for cache management")
    p.add_argument("--port", type=int, default=9001, help="Port to listen on")
    p.add_argument(
        "--ns",
        "--namespace",
        default="dynamo",
        # The help text used to claim "discover all", which contradicted the
        # actual default value.
        help="Specify Dynamo namespace (default: dynamo)",
    )
    p.add_argument(
        "--comp",
        "--component",
        default=None,
        help="Specify component name (default: discover all)",
    )
    p.add_argument(
        "--endpoint", default=FLUSH_CACHE_ENDPOINT, help="Specify endpoint name"
    )
    return p.parse_args(argv)
@dynamo_worker(static=False)
async def main(runtime: DistributedRuntime):
    """Parse the CLI options, build the HTTP server and serve until it stops."""
    cli_args = parse_args()
    server = SglangHttpServer(port=cli_args.port, runtime=runtime, args=cli_args)
    await server.start_server()
# Script entry point: runs only when the file is executed directly.
if
__name__
==
"__main__"
:
# uvloop is installed before the event loop is created by asyncio.run.
uvloop
.
install
()
# main is called without arguments here; presumably the dynamo_worker
# decorator supplies the runtime argument -- TODO confirm.
asyncio
.
run
(
main
())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment