discovery.rs 4.51 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::sync::Arc;

use dynamo_runtime::stream::StreamExt;

use dynamo_runtime::{
    DistributedRuntime,
    discovery::{
        DiscoveryEvent, DiscoveryInstance, DiscoveryInstanceId, DiscoveryQuery, DiscoveryStream,
    },
};
use serde::Deserialize;
use tokio_util::sync::CancellationToken;

use dynamo_kv_router::standalone_indexer::registry::WorkerRegistry;

/// Minimal subset of ModelDeploymentCard — only the fields the indexer needs.
///
/// Serde skips unknown fields unless `deny_unknown_fields` is set, so every other
/// field of the full card is simply ignored when deserializing into this type.
#[derive(Deserialize, Debug)]
struct PartialModelCard {
    /// Name the discovery watcher registers the worker's model under.
    /// Required: there is no serde default, so a card without it fails to deserialize.
    pub display_name: String,
    /// KV cache block size. `#[serde(default)]` makes a missing field deserialize
    /// to `0`, which the discovery watcher treats as invalid and skips.
    #[serde(default)]
    pub kv_cache_block_size: u32,
}

/// Spawn a background task that watches MDC discovery for worker additions/removals
/// and updates the WorkerRegistry accordingly.
pub async fn spawn_discovery_watcher(
    drt: &DistributedRuntime,
    registry: Arc<WorkerRegistry>,
    cancel_token: CancellationToken,
) -> anyhow::Result<()> {
    let discovery = drt.discovery();
    let mut stream: DiscoveryStream = discovery
        .list_and_watch(DiscoveryQuery::AllModels, Some(cancel_token.clone()))
        .await?;

    tokio::spawn(async move {
        tracing::info!("Discovery watcher started");

        while let Some(result) = stream.next().await {
            let event = match result {
                Ok(event) => event,
                Err(err) => {
                    tracing::error!(%err, "Error in discovery stream");
                    continue;
                }
            };

            match event {
                DiscoveryEvent::Added(instance) => {
                    let (instance_id, namespace, card) = match &instance {
                        DiscoveryInstance::Model {
                            instance_id,
                            namespace,
                            ..
                        } => match instance.deserialize_model::<PartialModelCard>() {
                            Ok(card) => (*instance_id, namespace.clone(), card),
                            Err(err) => {
                                tracing::error!(%err, instance_id, "Failed to deserialize model card");
                                continue;
                            }
                        },
                        _ => {
                            tracing::debug!("Ignoring non-model discovery instance");
                            continue;
                        }
                    };

                    let model_name = card.display_name.clone();
                    let block_size = card.kv_cache_block_size;
                    // Use the Dynamo namespace as the tenant_id
                    let tenant_id = namespace;

                    if block_size == 0 {
                        tracing::warn!(
                            instance_id,
                            model_name,
                            "Skipping worker with kv_cache_block_size=0"
                        );
                        continue;
                    }

                    tracing::info!(
                        instance_id,
                        model_name,
                        tenant_id,
                        block_size,
                        "Discovery: adding worker"
                    );

                    if let Err(e) = registry.add_worker_from_discovery(
                        instance_id,
                        model_name.clone(),
                        tenant_id,
                        block_size,
                    ) {
                        tracing::error!(
                            instance_id,
                            model_name,
                            error = %e,
                            "Failed to add discovered worker"
                        );
                    }
                }
                DiscoveryEvent::Removed(id) => {
                    let instance_id = match &id {
                        DiscoveryInstanceId::Model(mcid) => mcid.instance_id,
                        _ => {
                            tracing::debug!("Ignoring non-model discovery removal");
                            continue;
                        }
                    };

                    tracing::info!(instance_id, "Discovery: removing worker");
                    registry.remove_worker_from_discovery(instance_id).await;
                }
            }
        }

        tracing::info!("Discovery watcher exiting");
    });

    Ok(())
}