sglang · Commits

Unverified commit fde9b963, authored Oct 07, 2025 by Simo Lin, committed by GitHub on Oct 07, 2025.
Parent: f094e0a4

[router] cleanup worker health check to return early (#11310)

Showing 2 changed files with 133 additions and 133 deletions:

- sgl-router/src/core/worker.rs (+75, -67)
- sgl-router/src/core/worker_manager.rs (+58, -66)
sgl-router/src/core/worker.rs (view file @ fde9b963)
```diff
@@ -9,11 +9,14 @@ use serde_json;
 use std::fmt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, LazyLock};
+use std::time::Duration;
+use std::time::Instant;
 use tokio::sync::{Mutex, RwLock};
+use tokio::time;

 static WORKER_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
     reqwest::Client::builder()
-        .timeout(std::time::Duration::from_secs(30))
+        .timeout(Duration::from_secs(30))
        .build()
        .expect("Failed to create worker HTTP client")
 });
```
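The imports hunk mostly hoists `Duration`/`Instant` to file scope, but it also shows the router's pattern of one process-wide `reqwest::Client` behind `std::sync::LazyLock` (stable since Rust 1.80). A minimal sketch of that pattern, with a placeholder URL and timeout rather than the router's real config:

```rust
use std::sync::LazyLock;
use std::time::Duration;

// One shared client reuses reqwest's connection pool across all probes,
// instead of paying TCP/TLS setup for a fresh client on every check.
static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
    reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .build()
        .expect("failed to build HTTP client")
});

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // First use initializes the static; later calls share it.
    let status = CLIENT
        .get("http://localhost:8000/health") // placeholder URL
        .send()
        .await?
        .status();
    println!("health endpoint returned {status}");
    Ok(())
}
```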
```diff
@@ -227,6 +230,8 @@ pub trait Worker: Send + Sync + fmt::Debug {
     async fn reset_grpc_client(&self) -> WorkerResult<()> {
         Ok(())
     }
+
+    async fn grpc_health_check(&self) -> WorkerResult<bool>;
+    async fn http_health_check(&self) -> WorkerResult<bool>;
 }

 /// Connection mode for worker communication
```
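The two new required methods split the health probe by transport and share a verdict convention: `Ok(true)`/`Ok(false)` means the probe ran and reports healthy or unhealthy, while `Err(..)` is reserved for failures of the probe itself. A toy sketch of that shape; the `async_trait` attribute and the `String`-based `WorkerResult` are stand-ins, since the router's actual error type and trait plumbing are outside this diff:

```rust
use async_trait::async_trait; // assumed; keeps the async trait object-safe

type WorkerResult<T> = Result<T, String>; // stand-in for the router's type

#[async_trait]
trait Worker: Send + Sync {
    // Required transport-specific probes; implementors decide how to ask.
    async fn http_health_check(&self) -> WorkerResult<bool>;
    async fn grpc_health_check(&self) -> WorkerResult<bool>;
}

struct AlwaysHealthy;

#[async_trait]
impl Worker for AlwaysHealthy {
    async fn http_health_check(&self) -> WorkerResult<bool> {
        Ok(true) // a real worker would hit its /health endpoint here
    }
    async fn grpc_health_check(&self) -> WorkerResult<bool> {
        Ok(true) // ... or issue a gRPC health RPC
    }
}

#[tokio::main]
async fn main() {
    let worker = AlwaysHealthy;
    assert_eq!(worker.http_health_check().await, Ok(true));
}
```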
```diff
@@ -407,66 +412,9 @@ impl Worker for BasicWorker {
     }

     async fn check_health_async(&self) -> WorkerResult<()> {
-        use std::time::Duration;
-
         let health_result = match &self.metadata.connection_mode {
-            ConnectionMode::Http => {
-                let url = self.normalised_url()?;
-                let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
-                let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
-                let mut request = WORKER_CLIENT.get(&health_url).timeout(timeout);
-                if let Some(ref api_key) = self.metadata.api_key {
-                    request = request.header("Authorization", format!("Bearer {}", api_key));
-                }
-                match request.send().await {
-                    Ok(response) => response.status().is_success(),
-                    Err(_) => false,
-                }
-            }
-            ConnectionMode::Grpc { .. } => {
-                // Use the new get_grpc_client() method for lazy initialization
-                match self.get_grpc_client().await {
-                    Ok(Some(grpc_client)) => {
-                        let mut client = grpc_client.lock().await;
-                        match client.health_check().await {
-                            Ok(response) => {
-                                tracing::debug!(
-                                    "gRPC health check succeeded for {}: healthy={}",
-                                    self.metadata.url,
-                                    response.healthy
-                                );
-                                response.healthy
-                            }
-                            Err(e) => {
-                                tracing::warn!(
-                                    "gRPC health check RPC failed for {}: {:?}",
-                                    self.metadata.url,
-                                    e
-                                );
-                                false
-                            }
-                        }
-                    }
-                    Ok(None) => {
-                        tracing::error!(
-                            "Worker {} is not a gRPC worker but has gRPC connection mode",
-                            self.metadata.url
-                        );
-                        false
-                    }
-                    Err(e) => {
-                        tracing::error!(
-                            "Failed to get gRPC client for worker {}: {:?}",
-                            self.metadata.url,
-                            e
-                        );
-                        false
-                    }
-                }
-            }
+            ConnectionMode::Http => self.http_health_check().await?,
+            ConnectionMode::Grpc { .. } => self.grpc_health_check().await?,
         };

         if health_result {
```
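With the probes factored out, `check_health_async` reduces to a dispatch on the connection mode, and the `?` is where the commit title's "return early" happens: an infrastructure `Err` propagates out immediately, while an `Ok(false)` verdict still flows into the success/failure bookkeeping that follows. A small sketch of that control flow, again with a `String` error stand-in:

```rust
type WorkerResult<T> = Result<T, String>; // stand-in error type

// Ok(true/false) is a verdict; Err means the probe could not run at all.
async fn http_health_check(up: bool) -> WorkerResult<bool> {
    Ok(up)
}

async fn check_health_async(up: bool) -> WorkerResult<()> {
    // `?` short-circuits only on Err; Ok(false) continues to the
    // bookkeeping below, just like the real method's health accounting.
    let healthy = http_health_check(up).await?;
    println!("probe verdict: healthy={healthy}");
    Ok(())
}

#[tokio::main]
async fn main() -> WorkerResult<()> {
    check_health_async(true).await?;
    check_health_async(false).await
}
```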
```diff
@@ -594,6 +542,61 @@ impl Worker for BasicWorker {
             }
         }
     }

+    async fn grpc_health_check(&self) -> WorkerResult<bool> {
+        let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
+        let maybe = self.get_grpc_client().await?;
+        let Some(grpc_client) = maybe else {
+            tracing::error!(
+                "Worker {} is not a gRPC worker but connection mode is gRPC",
+                self.metadata.url
+            );
+            return Ok(false);
+        };
+
+        let mut client = grpc_client.lock().await;
+        match time::timeout(timeout, client.health_check()).await {
+            Ok(Ok(resp)) => {
+                tracing::debug!(
+                    "gRPC health OK for {}: healthy={}",
+                    self.metadata.url,
+                    resp.healthy
+                );
+                Ok(resp.healthy)
+            }
+            Ok(Err(err)) => {
+                tracing::warn!("gRPC health RPC error for {}: {err:?}", self.metadata.url);
+                Ok(false)
+            }
+            Err(_) => {
+                tracing::warn!("gRPC health timed out for {}", self.metadata.url);
+                Ok(false)
+            }
+        }
+    }
+
+    async fn http_health_check(&self) -> WorkerResult<bool> {
+        let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
+        let url = self.normalised_url()?;
+        let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
+
+        let mut req = WORKER_CLIENT.get(health_url).timeout(timeout);
+        if let Some(api_key) = &self.metadata.api_key {
+            req = req.bearer_auth(api_key);
+        }
+
+        match req.send().await {
+            Ok(resp) => Ok(resp.status().is_success()),
+            Err(err) => {
+                tracing::warn!("HTTP health check failed for {}: {err:?}", self.metadata.url);
+                Ok(false)
+            }
+        }
+    }
 }

 /// A DP-aware worker that handles data-parallel routing
```
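Two idioms carry the new methods: `let ... else` for the missing-client early return, and `tokio::time::timeout`, whose result nests the probe's own `Result`, giving exactly three outcomes to match. A runnable sketch; the probe, its delay, and the deadline are invented for illustration:

```rust
use std::time::Duration;
use tokio::time;

// Stand-in for client.health_check(): deliberately slower than the deadline.
async fn slow_probe() -> Result<bool, String> {
    time::sleep(Duration::from_secs(2)).await;
    Ok(true)
}

async fn check(maybe_client: Option<&str>) -> Result<bool, String> {
    // `let ... else`: the early-return shape used for the missing-client case.
    let Some(_client) = maybe_client else {
        return Ok(false);
    };
    // timeout() yields: Ok(Ok(v)) = verdict, Ok(Err(e)) = RPC failed,
    // Err(_) = the deadline elapsed first.
    match time::timeout(Duration::from_millis(100), slow_probe()).await {
        Ok(Ok(healthy)) => Ok(healthy),
        Ok(Err(e)) => {
            eprintln!("rpc error: {e}");
            Ok(false)
        }
        Err(_) => {
            eprintln!("timed out");
            Ok(false)
        }
    }
}

#[tokio::main]
async fn main() {
    println!("{:?}", check(Some("client")).await); // Ok(false) after ~100 ms
    println!("{:?}", check(None).await);           // Ok(false) immediately
}
```

Note that a timeout or RPC error maps to `Ok(false)`, not `Err`: a slow or failing worker is treated as unhealthy rather than as a router-side failure.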
```diff
@@ -730,6 +733,14 @@ impl Worker for DPAwareWorker {
     async fn reset_grpc_client(&self) -> WorkerResult<()> {
         self.base_worker.reset_grpc_client().await
     }
+
+    async fn grpc_health_check(&self) -> WorkerResult<bool> {
+        self.base_worker.grpc_health_check().await
+    }
+
+    async fn http_health_check(&self) -> WorkerResult<bool> {
+        self.base_worker.http_health_check().await
+    }
 }

 /// Worker factory for creating workers of different types
```
```diff
@@ -755,10 +766,8 @@ impl WorkerFactory {
     /// Static health validation before creating a worker
     /// This replaces wait_for_worker_health in handlers
     pub async fn validate_health(url: &str, timeout_secs: u64) -> WorkerResult<()> {
-        use std::time::Instant;
-
         let start_time = Instant::now();
-        let timeout = std::time::Duration::from_secs(timeout_secs);
+        let timeout = Duration::from_secs(timeout_secs);

         loop {
             if start_time.elapsed() > timeout {
```
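`validate_health` keeps its poll-until-deadline loop; the change is only that `Instant` and `Duration` now come from the file-level imports. The general shape of such a loop, with a made-up readiness predicate standing in for the HTTP probe and a shortened retry sleep (the real code sleeps 1 s per attempt):

```rust
use std::time::{Duration, Instant};

// Poll `ready` until it succeeds or the deadline passes.
async fn wait_until(mut ready: impl FnMut() -> bool, deadline: Duration) -> Result<(), String> {
    let start = Instant::now();
    loop {
        if ready() {
            return Ok(());
        }
        if start.elapsed() > deadline {
            return Err(format!("not ready after {deadline:?}"));
        }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
}

#[tokio::main]
async fn main() {
    let mut attempts = 0;
    let result = wait_until(
        || {
            attempts += 1;
            attempts >= 3 // becomes ready on the third poll
        },
        Duration::from_secs(10),
    )
    .await;
    println!("{result:?}"); // Ok(()) after ~200 ms
}
```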
```diff
@@ -775,7 +784,7 @@ impl WorkerFactory {
             // API key authentication is handled in the worker instance's check_health_async method
             match WORKER_CLIENT
                 .get(format!("{}/health", url))
-                .timeout(std::time::Duration::from_secs(5))
+                .timeout(Duration::from_secs(5))
                 .send()
                 .await
             {
```
```diff
@@ -795,7 +804,7 @@ impl WorkerFactory {
                 }
             }

-            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+            tokio::time::sleep(Duration::from_secs(1)).await;
         }
     }
```
```diff
@@ -891,8 +900,7 @@ pub fn start_health_checker(
     let shutdown_clone = shutdown.clone();
     let handle = tokio::spawn(async move {
-        let mut interval =
-            tokio::time::interval(tokio::time::Duration::from_secs(check_interval_secs));
+        let mut interval = tokio::time::interval(Duration::from_secs(check_interval_secs));

         // Counter for periodic load reset (every 10 health check cycles)
         let mut check_count = 0u64;
```
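`start_health_checker` itself only loses the long `tokio::time::Duration` path, but the hunk shows its structure: a spawned task ticking on `tokio::time::interval`, with a cycle counter for the periodic load reset and a shutdown signal checked between ticks. A stripped-down sketch of that loop; the real checker's shutdown type isn't visible in this hunk, so the `Arc<AtomicBool>` here is an assumption:

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;

#[tokio::main]
async fn main() {
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();

    let handle = tokio::spawn(async move {
        // First tick completes immediately, then one fires every 50 ms.
        let mut interval = tokio::time::interval(Duration::from_millis(50));
        let mut check_count = 0u64;
        loop {
            interval.tick().await;
            if shutdown_clone.load(Ordering::Relaxed) {
                break;
            }
            check_count += 1;
            println!("health check cycle {check_count}");
        }
    });

    tokio::time::sleep(Duration::from_millis(200)).await;
    shutdown.store(true, Ordering::Relaxed);
    handle.await.unwrap();
}
```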
sgl-router/src/core/worker_manager.rs (view file @ fde9b963)
```diff
@@ -953,113 +953,105 @@ impl WorkerManager {
             return Ok(());
         }

+        // Mark all workers as unhealthy initially
         info!(
-            "Marking {} workers as unhealthy before initial health checks",
+            "Marking {} workers as unhealthy before health checks",
             workers.len()
         );
         for worker in &workers {
             worker.set_healthy(false);
         }

-        info!("Performing initial health checks for {} workers", workers.len());
-
-        let health_check_futures: Vec<_> = workers
-            .iter()
-            .map(|worker| {
-                let w = worker.clone();
-                let url = worker.url().to_string();
-                async move {
-                    match w.check_health_async().await {
-                        Ok(_) => {
-                            w.set_healthy(true);
-                            debug!("Worker {} passed initial health check and marked healthy", url);
-                            Ok(url)
-                        }
-                        Err(e) => {
-                            warn!("Worker {} failed initial health check: {}", url, e);
-                            Err(url)
-                        }
-                    }
-                }
-            })
-            .collect();
-
-        let health_results = future::join_all(health_check_futures).await;
-        let failed_checks: Vec<_> = health_results.into_iter().filter_map(|r| r.err()).collect();
-
-        if !failed_checks.is_empty() {
-            info!(
-                "Initial health check: {} workers failed: {:?}",
-                failed_checks.len(),
-                failed_checks
-            );
-        }
-
         loop {
+            // 1. Filter unhealthy workers
             let workers = registry.get_all();
-            let healthy_workers: Vec<_> = workers
-                .iter()
-                .filter(|w| w.is_healthy())
-                .map(|w| w.url().to_string())
-                .collect();
             let unhealthy_workers: Vec<_> = workers
                 .iter()
                 .filter(|w| !w.is_healthy())
-                .map(|w| w.url().to_string())
+                .cloned()
                 .collect();

+            // 2. If all workers are healthy, return immediately
             if unhealthy_workers.is_empty() {
+                let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
                 info!(
                     "All {} workers are healthy: {:?}",
                     workers.len(),
-                    healthy_workers
+                    healthy_urls
                 );
                 return Ok(());
             }

+            // Check timeout
             if start_time.elapsed() > timeout {
+                let healthy_workers: Vec<_> = workers
+                    .iter()
+                    .filter(|w| w.is_healthy())
+                    .map(|w| w.url().to_string())
+                    .collect();
+                let unhealthy_urls: Vec<_> =
+                    unhealthy_workers.iter().map(|w| w.url().to_string()).collect();
                 error!(
                     "Workers failed to become healthy after {}s. Unhealthy: {:?}, Healthy: {:?}",
-                    timeout_secs, unhealthy_workers, healthy_workers
+                    timeout_secs, unhealthy_urls, healthy_workers
                 );
                 return Err(format!(
                     "Workers failed to become healthy after {}s. Unhealthy: {:?}",
-                    timeout_secs, unhealthy_workers
+                    timeout_secs, unhealthy_urls
                 ));
             }

+            let unhealthy_urls: Vec<_> =
+                unhealthy_workers.iter().map(|w| w.url().to_string()).collect();
             info!(
                 "Waiting for {} workers to become healthy. Unhealthy: {:?}",
                 unhealthy_workers.len(),
-                unhealthy_workers
+                unhealthy_urls
             );

-            let unhealthy_workers_to_check = workers
-                .iter()
-                .filter(|w| !w.is_healthy())
-                .cloned()
-                .collect::<Vec<_>>();
-
-            for worker in unhealthy_workers_to_check {
-                let url = worker.url().to_string();
-                match worker.check_health_async().await {
-                    Ok(_) => {
-                        if !worker.is_healthy() {
-                            worker.set_healthy(true);
-                            debug!("Worker {} now healthy after health check", url);
-                        }
-                    }
-                    Err(e) => {
-                        debug!("Worker {} health check failed: {}", url, e);
-                    }
-                }
-            }
+            // 3. Check health of all unhealthy workers in parallel
+            let health_check_futures: Vec<_> = unhealthy_workers
+                .iter()
+                .map(|worker| {
+                    let w = worker.clone();
+                    let url = worker.url().to_string();
+                    async move {
+                        match w.check_health_async().await {
+                            Ok(_) => {
+                                w.set_healthy(true);
+                                debug!("Worker {} now healthy", url);
+                            }
+                            Err(e) => {
+                                debug!("Worker {} health check failed: {}", url, e);
+                            }
+                        }
+                    }
+                })
+                .collect();
+            future::join_all(health_check_futures).await;

+            // 4. Check if all workers are now healthy after health checks
+            let still_unhealthy: Vec<_> = workers.iter().filter(|w| !w.is_healthy()).collect();
+
+            // 5. If all workers are now healthy, return immediately without sleeping
+            if still_unhealthy.is_empty() {
+                let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
+                info!("All {} workers are healthy: {:?}", workers.len(), healthy_urls);
+                return Ok(());
+            }
+
+            // 6. Otherwise, sleep before next iteration
             tokio::time::sleep(check_interval).await;
         }
     }
```
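The reshaped loop probes every unhealthy worker concurrently via `future::join_all`, then re-checks and returns without the final sleep once nothing is left unhealthy. That is the other "return early": previously a wave of successful checks still paid one more `check_interval` sleep before the loop noticed. A self-contained sketch of the join-then-early-return shape; the worker URLs and the probe's pass/fail rule are fabricated for the example:

```rust
use futures::future;

// Stand-in probe: URLs containing "good" pass, everything else fails.
async fn probe(url: &str) -> Result<(), String> {
    if url.contains("good") {
        Ok(())
    } else {
        Err(format!("{url} unreachable"))
    }
}

#[tokio::main]
async fn main() {
    let workers = vec!["http://good-1", "http://good-2", "http://bad-1"];

    // Launch one future per worker and run them concurrently.
    let checks: Vec<_> = workers
        .iter()
        .map(|url| async move { (url, probe(url).await) })
        .collect();
    let results = future::join_all(checks).await;

    let still_unhealthy: Vec<_> = results
        .iter()
        .filter(|(_, r)| r.is_err())
        .map(|(url, _)| **url)
        .collect();

    // Return early instead of sleeping another interval.
    if still_unhealthy.is_empty() {
        println!("all workers healthy, returning early");
        return;
    }
    println!("still unhealthy: {still_unhealthy:?}");
}
```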