"torchvision/vscode:/vscode.git/clone" did not exist on "747f406a43a2547058cf133c2b152db8034fa28c"
Unverified commit 1468769b authored by Simo Lin, committed by GitHub

[Misc] add service discovery for sgl router

parent 91dda4cd
......@@ -30,7 +30,9 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
tracing-log = "0.2"
tracing-appender = "0.2.3"
kube = { version = "0.88.1", features = ["runtime", "derive"] }
k8s-openapi = { version = "0.21.0", features = ["v1_29"] }
futures = "0.3"
[profile.release]
lto = "thin"
codegen-units = 1
......@@ -81,6 +81,41 @@ router = Router(
Use the `--verbose` flag with the CLI for more detailed logs.
### Kubernetes Service Discovery
SGL Router supports automatic service discovery for worker nodes in Kubernetes environments. When enabled, the router will automatically:
- Discover and add worker pods with matching labels
- Remove unhealthy or deleted worker pods
- Dynamically adjust the worker pool based on pod health and availability
#### Command Line Usage
```bash
python -m sglang_router.launch_router \
--service-discovery \
--selector app=sglang-worker role=inference \
--service-discovery-port 8000 \
--service-discovery-namespace default
```
#### Service Discovery Arguments
- `--service-discovery`: Enable the Kubernetes service discovery feature
- `--selector`: One or more label key-value pairs for pod selection (format: key1=value1 key2=value2)
- `--service-discovery-port`: Port to use when generating worker URLs (default: 80)
- `--service-discovery-namespace`: Optional. Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)
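The same options can be set programmatically on the Python `Router` class. Below is a minimal sketch of the equivalent setup, assuming the constructor's defaults for any unspecified options; `worker_urls` starts empty because discovery populates the pool at runtime:

```python
from sglang_router import Router

router = Router(
    worker_urls=[],  # no static workers; discovery adds matching pods
    service_discovery=True,
    selector={"app": "sglang-worker", "role": "inference"},
    service_discovery_port=8000,
    service_discovery_namespace="default",  # omit to watch all namespaces
)
router.start()
```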
#### RBAC Requirements
When using service discovery, you must configure proper Kubernetes RBAC permissions:
- **If using namespace-scoped discovery** (with `--service-discovery-namespace`):
Set up a ServiceAccount, Role, and RoleBinding with permissions to list/watch pods in that namespace
- **If watching all namespaces** (without specifying namespace):
Set up a ServiceAccount, ClusterRole, and ClusterRoleBinding with permissions to list/watch pods at the cluster level
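As a concrete illustration of the namespace-scoped case, here is a sketch that applies a ServiceAccount, Role, and RoleBinding with `kubectl`. All resource names and the namespace are hypothetical; adapt them to your deployment, and run the router pod under the created ServiceAccount (`serviceAccountName` in the pod spec):

```python
import subprocess

# Hypothetical namespace-scoped RBAC manifest granting the pod list/watch
# permissions the watcher needs; names are placeholders.
RBAC_MANIFEST = """\
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sgl-router
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: sgl-router-pod-reader
  namespace: default
rules:
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: sgl-router-pod-reader
  namespace: default
subjects:
- kind: ServiceAccount
  name: sgl-router
  namespace: default
roleRef:
  kind: Role
  name: sgl-router-pod-reader
  apiGroup: rbac.authorization.k8s.io
"""

# Pipe the manifest into kubectl; requires kubectl and cluster access.
subprocess.run(["kubectl", "apply", "-f", "-"], input=RBAC_MANIFEST, text=True, check=True)
```

For cluster-wide discovery, the same shape applies with a ClusterRole and ClusterRoleBinding instead.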
### Troubleshooting
1. If rust-analyzer is not working in VSCode, set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml` in your repo. For example:
......
......@@ -2,7 +2,7 @@ import argparse
import dataclasses
import logging
import sys
from typing import List, Optional
from typing import Dict, List, Optional
from sglang_router import Router
from sglang_router_rs import PolicyType
......@@ -43,6 +43,11 @@ class RouterArgs:
max_payload_size: int = 4 * 1024 * 1024 # 4MB
verbose: bool = False
log_dir: Optional[str] = None
# Service discovery configuration
service_discovery: bool = False
selector: Dict[str, str] = dataclasses.field(default_factory=dict)
service_discovery_port: int = 80
service_discovery_namespace: Optional[str] = None
@staticmethod
def add_cli_args(
......@@ -149,6 +154,28 @@ class RouterArgs:
default=None,
help="Directory to store log files. If not specified, logs are only output to console.",
)
parser.add_argument(
f"--{prefix}service-discovery",
action="store_true",
help="Enable Kubernetes service discovery",
)
parser.add_argument(
f"--{prefix}selector",
type=str,
nargs="+",
help="Label selector for Kubernetes service discovery (format: key1=value1 key2=value2)",
)
parser.add_argument(
f"--{prefix}service-discovery-port",
type=int,
default=RouterArgs.service_discovery_port,
help="Port to use for discovered worker pods",
)
parser.add_argument(
f"--{prefix}service-discovery-namespace",
type=str,
help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
)
@classmethod
def from_cli_args(
......@@ -182,8 +209,26 @@ class RouterArgs:
max_payload_size=getattr(args, f"{prefix}max_payload_size"),
verbose=getattr(args, f"{prefix}verbose", False),
log_dir=getattr(args, f"{prefix}log_dir", None),
service_discovery=getattr(args, f"{prefix}service_discovery", False),
selector=cls._parse_selector(getattr(args, f"{prefix}selector", None)),
service_discovery_port=getattr(args, f"{prefix}service_discovery_port"),
service_discovery_namespace=getattr(
args, f"{prefix}service_discovery_namespace", None
),
)
@staticmethod
def _parse_selector(selector_list):
"""Parse selector items like ["app=worker", "env=prod"] into a dict; items without "=" are ignored."""
if not selector_list:
return {}
selector = {}
for item in selector_list:
if "=" in item:
# Split on the first "=" so values may themselves contain "=".
key, value = item.split("=", 1)
selector[key] = value
return selector
def policy_from_str(policy_str: str) -> PolicyType:
"""Convert policy string to PolicyType enum."""
......@@ -229,6 +274,10 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
max_payload_size=router_args.max_payload_size,
verbose=router_args.verbose,
log_dir=router_args.log_dir,
service_discovery=router_args.service_discovery,
selector=router_args.selector,
service_discovery_port=router_args.service_discovery_port,
service_discovery_namespace=router_args.service_discovery_namespace,
)
router.start()
......
from typing import List, Optional
from typing import Dict, List, Optional
from sglang_router_rs import PolicyType
from sglang_router_rs import Router as _Router
......@@ -32,6 +32,14 @@ class Router:
max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
verbose: Enable verbose logging. Default: False
log_dir: Directory to store log files. If None, logs are only output to console. Default: None
service_discovery: Enable Kubernetes service discovery. When enabled, the router will
automatically discover worker pods based on the selector. Default: False
selector: Dictionary mapping of label keys to values for Kubernetes pod selection.
Example: {"app": "sglang-worker"}. Default: {}
service_discovery_port: Port to use for service discovery. The router will generate
worker URLs using this port. Default: 80
service_discovery_namespace: Kubernetes namespace to watch for pods. If not provided,
watches pods across all namespaces (requires cluster-wide permissions). Default: None
"""
def __init__(
......@@ -50,7 +58,14 @@ class Router:
max_payload_size: int = 4 * 1024 * 1024, # 4MB
verbose: bool = False,
log_dir: Optional[str] = None,
service_discovery: bool = False,
selector: Optional[Dict[str, str]] = None,
service_discovery_port: int = 80,
service_discovery_namespace: Optional[str] = None,
):
if selector is None:
selector = {}
self._router = _Router(
worker_urls=worker_urls,
policy=policy,
......@@ -66,6 +81,10 @@ class Router:
max_payload_size=max_payload_size,
verbose=verbose,
log_dir=log_dir,
service_discovery=service_discovery,
selector=selector,
service_discovery_port=service_discovery_port,
service_discovery_namespace=service_discovery_namespace,
)
def start(self) -> None:
......
......@@ -38,6 +38,10 @@ class TestLaunchRouter(unittest.TestCase):
max_payload_size=4 * 1024 * 1024, # 4MB
verbose=False,
log_dir=None,
service_discovery=False,
selector=None,
service_discovery_port=80,
service_discovery_namespace=None,
)
def create_router_args(self, **kwargs):
......@@ -79,6 +83,23 @@ class TestLaunchRouter(unittest.TestCase):
args = self.create_router_args(worker_urls=[])
self.run_router_process(args)
def test_launch_router_with_service_discovery(self):
# Test router startup with service discovery enabled and a basic selector
args = self.create_router_args(
worker_urls=[], service_discovery=True, selector=["app=test-worker"]
)
self.run_router_process(args)
def test_launch_router_with_service_discovery_namespace(self):
# Test router startup with service discovery enabled and namespace specified
args = self.create_router_args(
worker_urls=[],
service_discovery=True,
selector=["app=test-worker"],
service_discovery_namespace="test-namespace",
)
self.run_router_process(args)
if __name__ == "__main__":
unittest.main()
......@@ -24,6 +24,10 @@ def popen_launch_router(
max_payload_size: int = None,
api_key: str = None,
log_dir: str = None,
service_discovery: bool = False,
selector: list = None,
service_discovery_port: int = 80,
service_discovery_namespace: str = None,
):
"""
Launch the router server process.
......@@ -37,6 +41,10 @@ def popen_launch_router(
max_payload_size: Maximum payload size in bytes
api_key: API key for the router
log_dir: Directory to store log files. If None, logs are only output to console.
service_discovery: Enable Kubernetes service discovery
selector: List of label selectors in format ["key1=value1", "key2=value2"]
service_discovery_port: Port to use for service discovery
service_discovery_namespace: Kubernetes namespace to watch for pods. If None, watches all namespaces.
"""
_, host, port = base_url.split(":")
host = host[2:]
......@@ -65,6 +73,20 @@ def popen_launch_router(
if max_payload_size is not None:
command.extend(["--router-max-payload-size", str(max_payload_size)])
if service_discovery:
command.append("--router-service-discovery")
if selector:
command.extend(["--router-selector"] + selector)
if service_discovery_port != 80:
command.extend(["--router-service-discovery-port", str(service_discovery_port)])
if service_discovery_namespace:
command.extend(
["--router-service-discovery-namespace", service_discovery_namespace]
)
if log_dir is not None:
command.extend(["--log-dir", log_dir])
......
use pyo3::prelude::*;
pub mod logging;
use std::collections::HashMap;
pub mod router;
pub mod server;
pub mod service_discovery;
pub mod tree;
#[pyclass(eq)]
......@@ -29,6 +31,10 @@ struct Router {
max_payload_size: usize,
verbose: bool,
log_dir: Option<String>,
service_discovery: bool,
selector: HashMap<String, String>,
service_discovery_port: u16,
service_discovery_namespace: Option<String>,
}
#[pymethods]
......@@ -49,6 +55,10 @@ impl Router {
max_payload_size = 4 * 1024 * 1024,
verbose = false,
log_dir = None,
service_discovery = false,
selector = HashMap::new(),
service_discovery_port = 80,
service_discovery_namespace = None
))]
fn new(
worker_urls: Vec<String>,
......@@ -65,6 +75,10 @@ impl Router {
max_payload_size: usize,
verbose: bool,
log_dir: Option<String>,
service_discovery: bool,
selector: HashMap<String, String>,
service_discovery_port: u16,
service_discovery_namespace: Option<String>,
) -> PyResult<Self> {
Ok(Router {
host,
......@@ -81,6 +95,10 @@ impl Router {
max_payload_size,
verbose,
log_dir,
service_discovery,
selector,
service_discovery_port,
service_discovery_namespace,
})
}
......@@ -105,6 +123,19 @@ impl Router {
},
};
// Create service discovery config if enabled
let service_discovery_config = if self.service_discovery {
Some(service_discovery::ServiceDiscoveryConfig {
enabled: true,
selector: self.selector.clone(),
check_interval: std::time::Duration::from_secs(60),
port: self.service_discovery_port,
namespace: self.service_discovery_namespace.clone(),
})
} else {
None
};
actix_web::rt::System::new().block_on(async move {
server::startup(server::ServerConfig {
host: self.host.clone(),
......@@ -114,6 +145,7 @@ impl Router {
verbose: self.verbose,
max_payload_size: self.max_payload_size,
log_dir: self.log_dir.clone(),
service_discovery_config,
})
.await
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
......
......@@ -240,6 +240,15 @@ impl Router {
})
}
/// Returns a clone of the shared worker URL handle (`Arc<RwLock<Vec<String>>>`)
pub fn get_worker_urls(&self) -> Arc<RwLock<Vec<String>>> {
match self {
Router::RoundRobin { worker_urls, .. } => Arc::clone(worker_urls),
Router::Random { worker_urls, .. } => Arc::clone(worker_urls),
Router::CacheAware { worker_urls, .. } => Arc::clone(worker_urls),
}
}
fn wait_for_healthy_workers(
worker_urls: &[String],
timeout_secs: u64,
......
use crate::logging::{self, LoggingConfig};
use crate::router::PolicyConfig;
use crate::router::Router;
use crate::service_discovery::{start_service_discovery, ServiceDiscoveryConfig};
use actix_web::{
error, get, post, web, App, Error, HttpRequest, HttpResponse, HttpServer, Responder,
};
use bytes::Bytes;
use futures_util::StreamExt;
use reqwest::Client;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tracing::{info, Level};
use tokio::spawn;
use tracing::{error, info, warn, Level};
#[derive(Debug)]
pub struct AppState {
router: Router,
client: reqwest::Client,
client: Client,
}
impl AppState {
pub fn new(
worker_urls: Vec<String>,
client: reqwest::Client,
client: Client,
policy_config: PolicyConfig,
) -> Result<Self, String> {
// Create router based on policy
......@@ -149,6 +153,7 @@ pub struct ServerConfig {
pub verbose: bool,
pub max_payload_size: usize,
pub log_dir: Option<String>,
pub service_discovery_config: Option<ServiceDiscoveryConfig>,
}
pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
......@@ -180,7 +185,15 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
config.max_payload_size / (1024 * 1024)
);
let client = reqwest::Client::builder()
// Log service discovery status
if let Some(service_discovery_config) = &config.service_discovery_config {
info!("🚧 Service discovery enabled");
info!("🚧 Selector: {:?}", service_discovery_config.selector);
} else {
info!("🚧 Service discovery disabled");
}
let client = Client::builder()
.pool_idle_timeout(Some(Duration::from_secs(50)))
.build()
.expect("Failed to create HTTP client");
......@@ -194,6 +207,30 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?,
);
// Start the service discovery if enabled
if let Some(service_discovery_config) = config.service_discovery_config {
if service_discovery_config.enabled {
let worker_urls = app_state.router.get_worker_urls();
match start_service_discovery(service_discovery_config, worker_urls).await {
Ok(handle) => {
info!("✅ Service discovery started successfully");
// Spawn a task to await the discovery handle and log any failure
spawn(async move {
if let Err(e) = handle.await {
error!("Service discovery task failed: {:?}", e);
}
});
}
Err(e) => {
error!("Failed to start service discovery: {}", e);
warn!("Continuing without service discovery");
}
}
}
}
info!("✅ Serving router on {}:{}", config.host, config.port);
info!("✅ Serving workers on {:?}", config.worker_urls);
......
use futures::{StreamExt, TryStreamExt};
use k8s_openapi::api::core::v1::Pod;
use kube::{
api::Api,
runtime::watcher::{watcher, Config},
runtime::WatchStreamExt,
Client,
};
use log::{error, info, warn};
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;
use tokio::task;
use tokio::time;
/// Represents the service discovery configuration
#[derive(Debug, Clone)]
pub struct ServiceDiscoveryConfig {
pub enabled: bool,
pub selector: HashMap<String, String>,
pub check_interval: Duration,
pub port: u16,
pub namespace: Option<String>,
}
impl Default for ServiceDiscoveryConfig {
fn default() -> Self {
ServiceDiscoveryConfig {
enabled: false,
selector: HashMap::new(),
check_interval: Duration::from_secs(60),
port: 80, // Default port to connect to pods
namespace: None, // None means watch all namespaces
}
}
}
/// Represents a Kubernetes pod's information used for worker management
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PodInfo {
pub name: String,
pub ip: String,
pub status: String,
pub is_ready: bool,
}
impl PodInfo {
pub fn from_pod(pod: &Pod) -> Option<Self> {
let name = pod.metadata.name.clone()?;
let status = pod.status.clone()?;
let pod_ip = status.pod_ip?;
let is_ready = if let Some(conditions) = &status.conditions {
conditions
.iter()
.any(|condition| condition.type_ == "Ready" && condition.status == "True")
} else {
false
};
let pod_status = status.phase.unwrap_or_else(|| "Unknown".to_string());
Some(PodInfo {
name,
ip: pod_ip,
status: pod_status,
is_ready,
})
}
/// Returns true if the pod is in a state where it can accept traffic
pub fn is_healthy(&self) -> bool {
self.is_ready && self.status == "Running"
}
/// Generates a worker URL for this pod
pub fn worker_url(&self, port: u16) -> String {
format!("http://{}:{}", self.ip, port)
}
}
pub async fn start_service_discovery(
config: ServiceDiscoveryConfig,
worker_urls: Arc<RwLock<Vec<String>>>,
) -> Result<task::JoinHandle<()>, kube::Error> {
// Don't initialize anything if service discovery is disabled
if !config.enabled {
// Return a generic error when service discovery is disabled
return Err(kube::Error::Api(kube::error::ErrorResponse {
status: "Disabled".to_string(),
message: "Service discovery is disabled".to_string(),
reason: "ConfigurationError".to_string(),
code: 400,
}));
}
// Initialize Kubernetes client
let client = Client::try_default().await?;
// Render the label selector as a string (used for the log message below)
let label_selector = config
.selector
.iter()
.map(|(k, v)| format!("{}={}", k, v))
.collect::<Vec<_>>()
.join(",");
info!(
"Starting Kubernetes service discovery with selector: {}",
label_selector
);
// Create the task that will run in the background
let handle = task::spawn(async move {
// We'll track pods we've already added to avoid duplicates
let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
// Create a watcher for pods
let pods: Api<Pod> = if let Some(namespace) = &config.namespace {
Api::namespaced(client, namespace)
} else {
Api::all(client)
};
info!("Kubernetes service discovery initialized successfully");
// Create an Arc for the selector map
let selector = Arc::new(config.selector);
let port = config.port;
loop {
// Watch pod events; the label selector is applied as a client-side filter below
let watcher_config = Config::default();
let watcher_stream = watcher(pods.clone(), watcher_config).applied_objects();
// Clone Arcs for the closures
let selector_clone = Arc::clone(&selector);
let tracked_pods_clone = Arc::clone(&tracked_pods);
let worker_urls_clone = Arc::clone(&worker_urls);
// Filter the watch stream client-side so only pods matching the selector are processed
let filtered_stream = watcher_stream.filter_map(move |obj_res| {
let selector_inner = Arc::clone(&selector_clone);
async move {
match obj_res {
Ok(pod) => {
// Only process pods matching our label selector
if pod.metadata.labels.as_ref().map_or(false, |labels| {
// Check if the pod has all the labels from our selector
selector_inner.iter().all(|(k, v)| {
labels.get(k).map_or(false, |label_value| label_value == v)
})
}) {
Some(Ok(pod))
} else {
None
}
}
Err(e) => Some(Err(e)),
}
}
});
// Clone again for the next closure
let tracked_pods_clone2 = Arc::clone(&tracked_pods_clone);
let worker_urls_clone2 = Arc::clone(&worker_urls_clone);
match filtered_stream
.try_for_each(move |pod| {
let tracked_pods_inner = Arc::clone(&tracked_pods_clone2);
let worker_urls_inner = Arc::clone(&worker_urls_clone2);
async move {
if let Some(pod_info) = PodInfo::from_pod(&pod) {
if pod.metadata.deletion_timestamp.is_some() {
handle_pod_deletion(
&pod_info,
tracked_pods_inner,
worker_urls_inner,
port,
)
.await;
} else {
handle_pod_event(
&pod_info,
tracked_pods_inner,
worker_urls_inner,
port,
)
.await;
}
}
Ok(())
}
})
.await
{
Ok(_) => {}
Err(err) => {
error!("Error in Kubernetes watcher: {}", err);
// Wait a bit before retrying
time::sleep(Duration::from_secs(5)).await;
}
}
// If the watcher exits for some reason, wait a bit before restarting
warn!(
"Kubernetes watcher exited, restarting in {} seconds",
config.check_interval.as_secs()
);
time::sleep(config.check_interval).await;
}
});
Ok(handle)
}
async fn handle_pod_event(
pod_info: &PodInfo,
tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
worker_urls: Arc<RwLock<Vec<String>>>,
port: u16,
) {
let worker_url = pod_info.worker_url(port);
// Check if pod is already tracked
let already_tracked = {
let tracker = tracked_pods.lock().unwrap();
tracker.contains(pod_info)
};
// If pod is healthy and not already tracked, add it
if pod_info.is_healthy() {
if !already_tracked {
info!(
"Adding healthy pod {} ({}) as worker",
pod_info.name, pod_info.ip
);
// Add URL to worker list
let mut urls = worker_urls.write().unwrap();
if !urls.contains(&worker_url) {
urls.push(worker_url.clone());
info!("Added new worker URL: {}", worker_url);
}
// Track this pod
let mut tracker = tracked_pods.lock().unwrap();
tracker.insert(pod_info.clone());
}
} else if already_tracked {
// If pod was healthy before but not anymore, remove it
handle_pod_deletion(pod_info, tracked_pods, worker_urls, port).await;
}
}
async fn handle_pod_deletion(
pod_info: &PodInfo,
tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
worker_urls: Arc<RwLock<Vec<String>>>,
port: u16,
) {
let worker_url = pod_info.worker_url(port);
// Remove the pod from our tracking
let was_tracked = {
let mut tracker = tracked_pods.lock().unwrap();
tracker.remove(pod_info)
};
if was_tracked {
info!(
"Removing pod {} ({}) from workers",
pod_info.name, pod_info.ip
);
// Remove URL from worker list
let mut urls = worker_urls.write().unwrap();
if let Some(idx) = urls.iter().position(|url| url == &worker_url) {
urls.remove(idx);
info!("Removed worker URL: {}", worker_url);
}
}
}