Unverified commit fde9b963 authored by Simo Lin, committed by GitHub

[router] cleanup worker health check to return early (#11310)
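The health probe for each connection mode now lives in its own trait method and returns early instead of nesting matches, with the gRPC probe bounded by an explicit timeout. A minimal sketch of that early-return shape (illustrative only: `Probe`, `grpc_health_check`, and the 5-second timeout are placeholders, not the router's types or configuration):

```rust
use std::time::Duration;
use tokio::time;

// Placeholder client standing in for the router's gRPC health client.
struct Probe;

impl Probe {
    async fn health_check(&mut self) -> Result<bool, String> {
        Ok(true)
    }
}

// Early-return health check: a missing client or any failure maps to "unhealthy",
// and the RPC is bounded by an explicit timeout so a hung worker cannot stall the checker.
async fn grpc_health_check(client: Option<Probe>) -> Result<bool, String> {
    let Some(mut client) = client else {
        return Ok(false);
    };
    match time::timeout(Duration::from_secs(5), client.health_check()).await {
        Ok(Ok(healthy)) => Ok(healthy),
        Ok(Err(_)) | Err(_) => Ok(false),
    }
}

#[tokio::main]
async fn main() {
    println!("{:?}", grpc_health_check(Some(Probe)).await);
}
```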

parent f094e0a4
@@ -9,11 +9,14 @@
 use serde_json;
 use std::fmt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, LazyLock};
+use std::time::Duration;
+use std::time::Instant;
 use tokio::sync::{Mutex, RwLock};
+use tokio::time;
 static WORKER_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
     reqwest::Client::builder()
-        .timeout(std::time::Duration::from_secs(30))
+        .timeout(Duration::from_secs(30))
         .build()
         .expect("Failed to create worker HTTP client")
 });
@@ -227,6 +230,8 @@ pub trait Worker: Send + Sync + fmt::Debug {
     async fn reset_grpc_client(&self) -> WorkerResult<()> {
         Ok(())
     }
 
+    async fn grpc_health_check(&self) -> WorkerResult<bool>;
+    async fn http_health_check(&self) -> WorkerResult<bool>;
 }
/// Connection mode for worker communication /// Connection mode for worker communication
@@ -407,66 +412,9 @@ impl Worker for BasicWorker {
     }
 
     async fn check_health_async(&self) -> WorkerResult<()> {
-        use std::time::Duration;
-
         let health_result = match &self.metadata.connection_mode {
-            ConnectionMode::Http => {
-                let url = self.normalised_url()?;
-                let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
-                let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
-                let mut request = WORKER_CLIENT.get(&health_url).timeout(timeout);
-
-                if let Some(ref api_key) = self.metadata.api_key {
-                    request = request.header("Authorization", format!("Bearer {}", api_key));
-                }
-
-                match request.send().await {
-                    Ok(response) => response.status().is_success(),
-                    Err(_) => false,
-                }
-            }
-            ConnectionMode::Grpc { .. } => {
-                // Use the new get_grpc_client() method for lazy initialization
-                match self.get_grpc_client().await {
-                    Ok(Some(grpc_client)) => {
-                        let mut client = grpc_client.lock().await;
-                        match client.health_check().await {
-                            Ok(response) => {
-                                tracing::debug!(
-                                    "gRPC health check succeeded for {}: healthy={}",
-                                    self.metadata.url,
-                                    response.healthy
-                                );
-                                response.healthy
-                            }
-                            Err(e) => {
-                                tracing::warn!(
-                                    "gRPC health check RPC failed for {}: {:?}",
-                                    self.metadata.url,
-                                    e
-                                );
-                                false
-                            }
-                        }
-                    }
-                    Ok(None) => {
-                        tracing::error!(
-                            "Worker {} is not a gRPC worker but has gRPC connection mode",
-                            self.metadata.url
-                        );
-                        false
-                    }
-                    Err(e) => {
-                        tracing::error!(
-                            "Failed to get gRPC client for worker {}: {:?}",
-                            self.metadata.url,
-                            e
-                        );
-                        false
-                    }
-                }
-            }
+            ConnectionMode::Http => self.http_health_check().await?,
+            ConnectionMode::Grpc { .. } => self.grpc_health_check().await?,
         };
 
         if health_result {
@@ -594,6 +542,61 @@ impl Worker for BasicWorker {
             }
         }
     }
+
+    async fn grpc_health_check(&self) -> WorkerResult<bool> {
+        let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
+
+        let maybe = self.get_grpc_client().await?;
+        let Some(grpc_client) = maybe else {
+            tracing::error!(
+                "Worker {} is not a gRPC worker but connection mode is gRPC",
+                self.metadata.url
+            );
+            return Ok(false);
+        };
+
+        let mut client = grpc_client.lock().await;
+        match time::timeout(timeout, client.health_check()).await {
+            Ok(Ok(resp)) => {
+                tracing::debug!(
+                    "gRPC health OK for {}: healthy={}",
+                    self.metadata.url,
+                    resp.healthy
+                );
+                Ok(resp.healthy)
+            }
+            Ok(Err(err)) => {
+                tracing::warn!("gRPC health RPC error for {}: {err:?}", self.metadata.url);
+                Ok(false)
+            }
+            Err(_) => {
+                tracing::warn!("gRPC health timed out for {}", self.metadata.url);
+                Ok(false)
+            }
+        }
+    }
+
+    async fn http_health_check(&self) -> WorkerResult<bool> {
+        let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
+
+        let url = self.normalised_url()?;
+        let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
+        let mut req = WORKER_CLIENT.get(health_url).timeout(timeout);
+        if let Some(api_key) = &self.metadata.api_key {
+            req = req.bearer_auth(api_key);
+        }
+
+        match req.send().await {
+            Ok(resp) => Ok(resp.status().is_success()),
+            Err(err) => {
+                tracing::warn!(
+                    "HTTP health check failed for {}: {err:?}",
+                    self.metadata.url
+                );
+                Ok(false)
+            }
+        }
+    }
 }
 
 /// A DP-aware worker that handles data-parallel routing
@@ -730,6 +733,14 @@ impl Worker for DPAwareWorker {
     async fn reset_grpc_client(&self) -> WorkerResult<()> {
         self.base_worker.reset_grpc_client().await
     }
+
+    async fn grpc_health_check(&self) -> WorkerResult<bool> {
+        self.base_worker.grpc_health_check().await
+    }
+
+    async fn http_health_check(&self) -> WorkerResult<bool> {
+        self.base_worker.http_health_check().await
+    }
 }
 
 /// Worker factory for creating workers of different types
@@ -755,10 +766,8 @@ impl WorkerFactory {
     /// Static health validation before creating a worker
     /// This replaces wait_for_worker_health in handlers
     pub async fn validate_health(url: &str, timeout_secs: u64) -> WorkerResult<()> {
-        use std::time::Instant;
-
         let start_time = Instant::now();
-        let timeout = std::time::Duration::from_secs(timeout_secs);
+        let timeout = Duration::from_secs(timeout_secs);
 
         loop {
             if start_time.elapsed() > timeout {
@@ -775,7 +784,7 @@ impl WorkerFactory {
             // API key authentication is handled in the worker instance's check_health_async method
             match WORKER_CLIENT
                 .get(format!("{}/health", url))
-                .timeout(std::time::Duration::from_secs(5))
+                .timeout(Duration::from_secs(5))
                 .send()
                 .await
             {
@@ -795,7 +804,7 @@ impl WorkerFactory {
                 }
             }
 
-            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+            tokio::time::sleep(Duration::from_secs(1)).await;
         }
     }
 }
@@ -891,8 +900,7 @@ pub fn start_health_checker(
     let shutdown_clone = shutdown.clone();
 
     let handle = tokio::spawn(async move {
-        let mut interval =
-            tokio::time::interval(tokio::time::Duration::from_secs(check_interval_secs));
+        let mut interval = tokio::time::interval(Duration::from_secs(check_interval_secs));
 
         // Counter for periodic load reset (every 10 health check cycles)
         let mut check_count = 0u64;
@@ -953,113 +953,105 @@ impl WorkerManager {
             return Ok(());
         }
 
-        // Mark all workers as unhealthy initially
         info!(
-            "Marking {} workers as unhealthy before initial health checks",
+            "Marking {} workers as unhealthy before health checks",
             workers.len()
         );
         for worker in &workers {
             worker.set_healthy(false);
         }
 
-        info!(
-            "Performing initial health checks for {} workers",
-            workers.len()
-        );
-
-        let health_check_futures: Vec<_> = workers
-            .iter()
-            .map(|worker| {
-                let w = worker.clone();
-                let url = worker.url().to_string();
-                async move {
-                    match w.check_health_async().await {
-                        Ok(_) => {
-                            w.set_healthy(true);
-                            debug!(
-                                "Worker {} passed initial health check and marked healthy",
-                                url
-                            );
-                            Ok(url)
-                        }
-                        Err(e) => {
-                            warn!("Worker {} failed initial health check: {}", url, e);
-                            Err(url)
-                        }
-                    }
-                }
-            })
-            .collect();
-
-        let health_results = future::join_all(health_check_futures).await;
-        let failed_checks: Vec<_> = health_results.into_iter().filter_map(|r| r.err()).collect();
-
-        if !failed_checks.is_empty() {
-            info!(
-                "Initial health check: {} workers failed: {:?}",
-                failed_checks.len(),
-                failed_checks
-            );
-        }
-
         loop {
+            // 1. Filter unhealthy workers
             let workers = registry.get_all();
-            let healthy_workers: Vec<_> = workers
-                .iter()
-                .filter(|w| w.is_healthy())
-                .map(|w| w.url().to_string())
-                .collect();
             let unhealthy_workers: Vec<_> = workers
                 .iter()
                 .filter(|w| !w.is_healthy())
-                .map(|w| w.url().to_string())
+                .cloned()
                 .collect();
 
+            // 2. If all workers are healthy, return immediately
             if unhealthy_workers.is_empty() {
+                let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
                 info!(
                     "All {} workers are healthy: {:?}",
                     workers.len(),
-                    healthy_workers
+                    healthy_urls
                 );
                 return Ok(());
            }
 
+            // Check timeout
             if start_time.elapsed() > timeout {
+                let healthy_workers: Vec<_> = workers
+                    .iter()
+                    .filter(|w| w.is_healthy())
+                    .map(|w| w.url().to_string())
+                    .collect();
+                let unhealthy_urls: Vec<_> = unhealthy_workers
+                    .iter()
+                    .map(|w| w.url().to_string())
+                    .collect();
                 error!(
                     "Workers failed to become healthy after {}s. Unhealthy: {:?}, Healthy: {:?}",
-                    timeout_secs, unhealthy_workers, healthy_workers
+                    timeout_secs, unhealthy_urls, healthy_workers
                 );
                 return Err(format!(
                     "Workers failed to become healthy after {}s. Unhealthy: {:?}",
-                    timeout_secs, unhealthy_workers
+                    timeout_secs, unhealthy_urls
                 ));
            }
 
+            let unhealthy_urls: Vec<_> = unhealthy_workers
+                .iter()
+                .map(|w| w.url().to_string())
+                .collect();
             info!(
                 "Waiting for {} workers to become healthy. Unhealthy: {:?}",
                 unhealthy_workers.len(),
-                unhealthy_workers
+                unhealthy_urls
             );
 
-            let unhealthy_workers_to_check = workers
-                .iter()
-                .filter(|w| !w.is_healthy())
-                .cloned()
-                .collect::<Vec<_>>();
-
-            for worker in unhealthy_workers_to_check {
-                let url = worker.url().to_string();
-                match worker.check_health_async().await {
-                    Ok(_) => {
-                        if !worker.is_healthy() {
-                            worker.set_healthy(true);
-                            debug!("Worker {} now healthy after health check", url);
-                        }
-                    }
-                    Err(e) => {
-                        debug!("Worker {} health check failed: {}", url, e);
-                    }
-                }
-            }
+            // 3. Check health of all unhealthy workers in parallel
+            let health_check_futures: Vec<_> = unhealthy_workers
+                .iter()
+                .map(|worker| {
+                    let w = worker.clone();
+                    let url = worker.url().to_string();
+                    async move {
+                        match w.check_health_async().await {
+                            Ok(_) => {
+                                w.set_healthy(true);
+                                debug!("Worker {} now healthy", url);
+                            }
+                            Err(e) => {
+                                debug!("Worker {} health check failed: {}", url, e);
+                            }
+                        }
+                    }
+                })
+                .collect();
+            future::join_all(health_check_futures).await;
+
+            // 4. Check if all workers are now healthy after health checks
+            let still_unhealthy: Vec<_> = workers.iter().filter(|w| !w.is_healthy()).collect();
+
+            // 5. If all workers are now healthy, return immediately without sleeping
+            if still_unhealthy.is_empty() {
+                let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
+                info!(
+                    "All {} workers are healthy: {:?}",
+                    workers.len(),
+                    healthy_urls
+                );
+                return Ok(());
+            }
+
+            // 6. Otherwise, sleep before next iteration
             tokio::time::sleep(check_interval).await;
         }
     }
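Note on the loop above: unhealthy workers are probed concurrently, and the function returns as soon as the set is empty, skipping the final sleep. A minimal standalone sketch of that fan-out-and-return-early pattern, with a toy `Worker` struct and `probe` function standing in for the router's registry and `check_health_async` (names are illustrative, not the actual API):

```rust
use std::time::Duration;
use futures::future;

struct Worker {
    url: String,
    healthy: bool,
}

// Stand-in for a real health probe; always succeeds here.
async fn probe(url: &str) -> bool {
    !url.is_empty()
}

async fn wait_until_healthy(workers: &mut [Worker], max_rounds: u32) -> Result<(), String> {
    for _ in 0..max_rounds {
        // Probe every currently-unhealthy worker concurrently.
        let pending: Vec<usize> = (0..workers.len()).filter(|&i| !workers[i].healthy).collect();
        let results = future::join_all(pending.iter().map(|&i| probe(&workers[i].url))).await;
        for (&i, ok) in pending.iter().zip(results) {
            workers[i].healthy = ok;
        }
        // Return early once everything is healthy, without sleeping.
        if workers.iter().all(|w| w.healthy) {
            return Ok(());
        }
        tokio::time::sleep(Duration::from_secs(1)).await;
    }
    Err("workers did not become healthy in time".to_string())
}

#[tokio::main]
async fn main() {
    let mut workers = vec![Worker { url: "http://w1".into(), healthy: false }];
    println!("{:?}", wait_until_healthy(&mut workers, 3).await);
}
```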