publisher.rs 37.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

16
17
18
use crate::kv_router::{
    indexer::{compute_block_hash_for_seq, RouterEvent},
    protocols::*,
19
20
    scoring::LoadEvent,
    KV_EVENT_SUBJECT, KV_METRICS_ENDPOINT, KV_METRICS_SUBJECT,
21
};
22
use async_trait::async_trait;
23
use dynamo_runtime::traits::{events::EventPublisher, DistributedRuntimeProvider};
Neelay Shah's avatar
Neelay Shah committed
24
use dynamo_runtime::{
25
    component::{Component, Namespace},
26
27
28
29
30
    pipeline::{
        network::Ingress, AsyncEngine, AsyncEngineContextProvider, ManyOut, ResponseStream,
        SingleIn,
    },
    protocols::annotated::Annotated,
31
    Error, Result,
32
};
33
34
35
use futures::stream;
use std::sync::Arc;
use tokio::sync::mpsc;
36
use tokio_util::sync::CancellationToken;
37

38
39
40
41
42
43
44
45
46
47
48
use rmp_serde as rmps;
use serde::Deserialize;
use serde::Serialize;
use std::sync::atomic::{AtomicU32, Ordering};
use std::time::Duration;
use zeromq::{Socket, SocketRecv, SubSocket};

// -------------------------------------------------------------------------
// KV Event Publishers -----------------------------------------------------
// -------------------------------------------------------------------------

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/// Configure the source of KV events.
///
/// Currently, only ZMQ is supported.
#[derive(Debug, Clone)]
pub enum KvEventSourceConfig {
    /// Receive KV events from a ZMQ publisher through a SUB socket.
    Zmq {
        /// Endpoint the SUB socket connects to.
        endpoint: String,
        /// Topic to subscribe to (an empty string subscribes to all topics).
        topic: String,
    },
}

/// The source of KV events.
///
/// Each variant owns what is needed to stop the source again; see
/// `KvEventSource::shutdown`.
enum KvEventSource {
    /// Events are read by a spawned ZMQ listener task.
    Zmq {
        /// Handle of the spawned listener task; it is aborted on shutdown.
        zmq_handle: tokio::task::JoinHandle<()>,
    },
}

impl KvEventSource {
    /// Start the event source from a [`KvEventSourceConfig`].
    fn start(
        component: Component,
66
        kv_block_size: u32,
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
        source_config: KvEventSourceConfig,
        cancellation_token: CancellationToken,
        tx: mpsc::UnboundedSender<KvCacheEvent>,
    ) -> Result<Self> {
        match source_config {
            KvEventSourceConfig::Zmq { endpoint, topic } => {
                let zmq_handle = component
                    .drt()
                    .runtime()
                    .secondary()
                    .spawn(start_zmq_listener(
                        endpoint,
                        topic,
                        tx,
                        cancellation_token.clone(),
                        kv_block_size,
                    ));

                Ok(KvEventSource::Zmq { zmq_handle })
            }
        }
    }

    fn shutdown(&self) {
        match self {
            KvEventSource::Zmq { zmq_handle } => {
                zmq_handle.abort();
            }
        }
    }
}

/// A publisher of KV events.
GuanLuo's avatar
GuanLuo committed
100
pub struct KvEventPublisher {
101
    /// The size of the KV block.
102
    kv_block_size: u32,
103
104
105
106
107
108
    /// The source of KV events.
    /// Can be `None` if all events provided through [`KvEventPublisher::publish`].
    source: Option<KvEventSource>,
    /// The cancellation token.
    cancellation_token: CancellationToken,
    /// The channel to send events to.
109
    tx: mpsc::UnboundedSender<KvCacheEvent>,
110
111
}

GuanLuo's avatar
GuanLuo committed
112
impl KvEventPublisher {
113
114
115
    pub fn new(
        component: Component,
        worker_id: i64,
116
        kv_block_size: u32,
117
118
119
120
        source_config: Option<KvEventSourceConfig>,
    ) -> Result<Self> {
        let cancellation_token = CancellationToken::new();

121
122
        let (tx, rx) = mpsc::unbounded_channel::<KvCacheEvent>();

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
        // Create our event source (if any)
        let mut source = None;
        if let Some(config) = source_config {
            source = Some(KvEventSource::start(
                component.clone(),
                kv_block_size,
                config,
                cancellation_token.clone(),
                tx.clone(),
            )?);
        }

        component
            .drt()
            .runtime()
            .secondary()
            .spawn(start_event_processor(
                component,
                worker_id,
                cancellation_token.clone(),
                rx,
            ));

        Ok(Self {
            kv_block_size,
            source,
            cancellation_token,
            tx,
        })
152
153
154
155
156
    }

    pub fn publish(&self, event: KvCacheEvent) -> Result<(), mpsc::error::SendError<KvCacheEvent>> {
        self.tx.send(event)
    }
157

158
    pub fn kv_block_size(&self) -> u32 {
159
160
        self.kv_block_size
    }
161

162
163
164
    pub fn shutdown(&mut self) {
        if !self.cancellation_token.is_cancelled() {
            self.cancellation_token.cancel();
165
        }
166

167
168
        if let Some(source) = self.source.take() {
            source.shutdown();
169
170
        }
    }
171
}
172

173
174
175
impl Drop for KvEventPublisher {
    fn drop(&mut self) {
        self.shutdown();
176
177
178
    }
}

179
180
async fn start_event_processor<P: EventPublisher + Send + Sync + 'static>(
    publisher: P,
181
    worker_id: i64,
182
183
    cancellation_token: CancellationToken,
    mut rx: mpsc::UnboundedReceiver<KvCacheEvent>,
184
185
186
187
) {
    loop {
        tokio::select! {
            _ = cancellation_token.cancelled() => {
188
                tracing::info!("KV Event source received cancellation signal");
189
190
                break;
            }
191
192
193
            event = rx.recv() => {
                let Some(event) = event else {
                    tracing::debug!("Event processor channel closed.");
194
195
196
                    break;
                };

197
                // Encapsulate in a router event and publish.
Alec's avatar
Alec committed
198
                tracing::trace!("Event processor for worker_id {} processing event: {:?}", worker_id, event.data);
199
200
201
                let router_event = RouterEvent::new(worker_id, event);
                if let Err(e) = publisher.publish(KV_EVENT_SUBJECT, &router_event).await {
                    tracing::error!("Failed to publish event: {}", e);
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
                }
            }
        }
    }
}

// Error handling configuration for ZMQ operations

/// Base backoff in milliseconds; it is multiplied by a power of two per error.
const INITIAL_BACKOFF_MS: u64 = 10;
/// Upper bound for any computed backoff, in milliseconds.
const MAX_BACKOFF_MS: u64 = 5000;
/// Number of consecutive receive errors after which the listener terminates.
const MAX_CONSECUTIVE_ERRORS: u32 = 10;
/// Largest exponent used for the multiplier: 2^8 = 256x, which prevents overflow.
const MAX_BACKOFF_EXPONENT: u32 = 8;

/// Calculate exponential backoff duration based on consecutive error count.
///
/// The result is `INITIAL_BACKOFF_MS * 2^min(consecutive_errors, MAX_BACKOFF_EXPONENT)`,
/// clamped to `MAX_BACKOFF_MS`.
fn calculate_backoff_ms(consecutive_errors: u32) -> u64 {
    let exponent = consecutive_errors.min(MAX_BACKOFF_EXPONENT);
    // The exponent is at most 8, so the shift equals 2^exponent and cannot overflow.
    let multiplier = 1_u64 << exponent;
    (INITIAL_BACKOFF_MS * multiplier).min(MAX_BACKOFF_MS)
}

Yan Ru Pei's avatar
Yan Ru Pei committed
222
pub async fn start_zmq_listener(
223
224
    zmq_endpoint: String,
    zmq_topic: String,
225
226
    tx: mpsc::UnboundedSender<KvCacheEvent>,
    cancellation_token: CancellationToken,
227
    kv_block_size: u32,
228
229
230
231
232
233
234
) {
    tracing::debug!(
        "KVEventPublisher connecting to ZMQ endpoint {} (topic '{}')",
        zmq_endpoint,
        zmq_topic
    );

235
236
    let warning_count = Arc::new(AtomicU32::new(0));

237
238
239
240
241
242
243
244
245
246
247
248
249
250
    let mut socket = SubSocket::new();

    // Subscribe to the requested topic (empty string == all topics)
    if let Err(e) = socket.subscribe(&zmq_topic).await {
        tracing::error!("Failed to subscribe on ZMQ socket: {}", e);
        return;
    }

    if let Err(e) = socket.connect(&zmq_endpoint).await {
        tracing::error!("Failed to connect ZMQ SUB socket: {}", e);
        return;
    }

    let mut consecutive_errors = 0u32;
Alec's avatar
Alec committed
251
252
253
    #[allow(unused_assignments)]
    let mut exit_reason = "unknown";
    let mut messages_processed = 0u64;
254

Alec's avatar
Alec committed
255
    'main: loop {
256
257
258
259
        tokio::select! {
            biased;

            // Check for cancellation
260
            _ = cancellation_token.cancelled() => {
Alec's avatar
Alec committed
261
262
263
                tracing::debug!("ZMQ listener received cancellation signal");
                exit_reason = "cancellation token cancelled";
                break 'main;
264
265
266
267
268
269
270
271
272
273
274
275
276
277
            }

            // Receive message
            msg_result = socket.recv() => {
                let Ok(msg) = msg_result else {
                    let e = msg_result.unwrap_err();
                    consecutive_errors += 1;

                    if consecutive_errors >= MAX_CONSECUTIVE_ERRORS {
                        tracing::error!(
                            error=%e,
                            consecutive_errors=%consecutive_errors,
                            "Too many consecutive ZMQ errors, terminating listener"
                        );
Alec's avatar
Alec committed
278
279
                        exit_reason = "too many consecutive errors";
                        break 'main;
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
                    }

                    // Simple exponential backoff with max exponent to prevent overflow
                    let backoff_ms = calculate_backoff_ms(consecutive_errors);

                    tracing::warn!(
                        error=%e,
                        consecutive_errors=%consecutive_errors,
                        backoff_ms=%backoff_ms,
                        "Error reading from ZMQ socket, applying exponential backoff"
                    );

                    tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
                    continue;
                };
                // Reset error count on successful message
                consecutive_errors = 0;

                // We expect multipart frames: [topic, seq, payload]
                let mut frames: Vec<Vec<u8>> = msg.into_vec().into_iter().map(|frame| frame.to_vec()).collect();

                if frames.len() != 3 {
                    tracing::warn!(expected=3, actual=%frames.len(), "Received unexpected ZMQ frame count");
                    continue;
                }
305
306
307
308

                // Extract the payload and sequence number.
                let payload = frames.pop().unwrap();
                let seq_bytes = frames.pop().unwrap();
309
310
311
312
313
314
315

                if seq_bytes.len() != 8 {
                    tracing::warn!(expected=8, actual=%seq_bytes.len(), "Invalid sequence number byte length");
                    continue;
                }

                let seq = u64::from_be_bytes(seq_bytes.try_into().unwrap());
316
317
318
319
320
321
322
323
324

                // Decode our batch of events.
                let batch_result = rmps::from_slice::<KvEventBatch>(&payload);
                let Ok(batch) = batch_result else {
                    let e = batch_result.unwrap_err();
                    tracing::warn!(error=%e, "Failed to decode KVEventBatch msgpack");
                    continue;
                };

Alec's avatar
Alec committed
325
326
327
328
329
330
                tracing::trace!(
                    "ZMQ listener on {} received batch with {} events (seq={})",
                    zmq_endpoint,
                    batch.events.len(),
                    seq
                );
331
                for raw_event in batch.events.into_iter() {
332
333
334
                    let event = convert_event(raw_event, seq, kv_block_size, &warning_count);
                    if tx.send(event).is_err() {
                        tracing::warn!("Failed to send message to channel - receiver dropped");
Alec's avatar
Alec committed
335
336
                        exit_reason = "channel receiver dropped";
                        break 'main;
337
                    }
Alec's avatar
Alec committed
338
                    messages_processed += 1;
339
340
341
342
                }
            }
        }
    }
Alec's avatar
Alec committed
343
344
345
346
347
    tracing::debug!(
        "ZMQ listener exiting, reason: {}, messages processed: {}",
        exit_reason,
        messages_processed
    );
348
349
350
}

/// Convert a raw event coming from the ZMQ channel into the internal
351
/// [`KvCacheEvent`] representation used by the router.
352
353
354
fn convert_event(
    raw: RawKvEvent,
    event_id: u64,
355
    kv_block_size: u32,
356
    warning_count: &Arc<AtomicU32>,
357
) -> KvCacheEvent {
358
359
360
361
362
363
364
365
366
    match raw {
        RawKvEvent::BlockStored {
            block_hashes,
            parent_block_hash,
            token_ids,
            block_size,
            lora_id,
        } => {
            let num_block_tokens = vec![block_size as u64; block_hashes.len()];
367
            KvCacheEvent {
368
369
370
371
372
373
374
375
376
377
378
379
                event_id,
                data: KvCacheEventData::Stored(KvCacheStoreData {
                    parent_hash: parent_block_hash.map(ExternalSequenceBlockHash::from),
                    blocks: create_stored_blocks(
                        kv_block_size,
                        &token_ids,
                        &num_block_tokens,
                        &block_hashes,
                        lora_id.unwrap_or(0),
                        warning_count,
                    ),
                }),
380
            }
381
382
383
384
385
386
        }
        RawKvEvent::BlockRemoved { block_hashes } => {
            let hashes = block_hashes
                .into_iter()
                .map(ExternalSequenceBlockHash::from)
                .collect();
387
            KvCacheEvent {
388
389
390
391
                event_id,
                data: KvCacheEventData::Removed(KvCacheRemoveData {
                    block_hashes: hashes,
                }),
392
            }
393
        }
394
395
396
397
        RawKvEvent::AllBlocksCleared => KvCacheEvent {
            event_id,
            data: KvCacheEventData::Cleared,
        },
398
399
400
401
    }
}

/// Build the stored-block record for a single block.
///
/// `block_hash` is converted into an [`ExternalSequenceBlockHash`]; the tokens
/// hash is the first element returned by [`compute_block_hash_for_seq`] for
/// `token_ids` and `kv_block_size`. `_lora_id` is accepted but not used.
///
/// # Panics
/// Panics if [`compute_block_hash_for_seq`] returns no hashes, because the
/// first element is indexed unconditionally.
/// NOTE(review): presumably that happens when `token_ids` holds fewer than
/// `kv_block_size` tokens; confirm against the indexer.
pub fn create_stored_block_from_parts(
    kv_block_size: u32,
    block_hash: i64,
    token_ids: &[u32],
    _lora_id: u64,
) -> KvCacheStoredBlockData {
    let sequence_hashes = compute_block_hash_for_seq(token_ids, kv_block_size);
    KvCacheStoredBlockData {
        block_hash: ExternalSequenceBlockHash::from(block_hash),
        tokens_hash: sequence_hashes[0],
    }
}

/// Build the stored-block records for a run of consecutive blocks.
///
/// `num_block_tokens` and `block_hashes` are walked in lockstep; block `i`
/// consumes the next `num_block_tokens[i]` entries of `token_ids`. Processing
/// stops at the first block that cannot be published, and the blocks built so
/// far are returned. A block cannot be published when:
/// - its token count differs from `kv_block_size`, or
/// - `token_ids` does not hold enough tokens for it. The input is decoded
///   from received messages, so this is handled instead of panicking.
///
/// `warning_count` limits the warnings logged for skipped blocks to the first
/// three; it is incremented every time a block is skipped.
pub fn create_stored_blocks(
    kv_block_size: u32,
    token_ids: &[u32],
    num_block_tokens: &[u64],
    block_hashes: &[i64],
    lora_id: u64,
    warning_count: &Arc<AtomicU32>,
) -> Vec<KvCacheStoredBlockData> {
    // `zip` yields at most the shorter of the two lengths.
    let mut blocks: Vec<KvCacheStoredBlockData> =
        Vec::with_capacity(num_block_tokens.len().min(block_hashes.len()));

    let mut token_offset: usize = 0;
    for (&num_tokens, &block_hash) in num_block_tokens.iter().zip(block_hashes) {
        if num_tokens != u64::from(kv_block_size) {
            if warning_count.fetch_add(1, Ordering::Relaxed) < 3 {
                tracing::warn!(
                    "Block not published. Block size must be {} tokens to be published. Block size is: {}",
                    kv_block_size,
                    num_tokens
                );
            }
            break;
        }

        // `num_tokens` equals `kv_block_size` here, so the block length is known
        // without a narrowing cast from u64.
        let block_len = kv_block_size as usize;
        let block_tokens = token_offset
            .checked_add(block_len)
            .and_then(|end| token_ids.get(token_offset..end));
        let Some(tokens) = block_tokens else {
            if warning_count.fetch_add(1, Ordering::Relaxed) < 3 {
                tracing::warn!(
                    "Block not published. Expected {} tokens at offset {} but only {} token ids were provided",
                    block_len,
                    token_offset,
                    token_ids.len()
                );
            }
            break;
        };

        blocks.push(create_stored_block_from_parts(
            kv_block_size,
            block_hash,
            tokens,
            lora_id,
        ));
        token_offset += block_len;
    }

    blocks
}

// -------------------------------------------------------------------------
// Types mirroring the Python msgspec-defined structures -------------------
// -------------------------------------------------------------------------

#[derive(Debug, Deserialize, Serialize)]
struct KvEventBatch {
    ts: f64,
    events: Vec<RawKvEvent>,
Alec's avatar
Alec committed
458
459
    #[serde(alias = "dp_rank")]
    data_parallel_rank: u32, // we are ignoring this for now
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
}

/// A single KV cache event in the shape it arrives on the wire, before it is
/// converted into a `KvCacheEvent`.
#[derive(Debug, Deserialize, Serialize)]
#[serde(tag = "type")] // msgspec encodes variant tag as a string when `tag=True`
enum RawKvEvent {
    /// One or more blocks were stored.
    BlockStored {
        /// Hash of every stored block, one entry per block.
        block_hashes: Vec<i64>,
        /// Hash of the parent block, if there is one.
        parent_block_hash: Option<i64>,
        /// Token ids of all stored blocks; consumed `block_size` tokens per block.
        token_ids: Vec<u32>,
        /// Number of tokens in each stored block.
        block_size: usize,
        /// LoRA id; treated as 0 when absent.
        lora_id: Option<u64>,
    },
    /// The listed blocks were removed.
    BlockRemoved {
        /// Hash of every removed block.
        block_hashes: Vec<i64>,
    },
    /// All blocks were cleared.
    AllBlocksCleared,
}

// -------------------------------------------------------------------------
// Metrics Publishers ------------------------------------------------------
// -------------------------------------------------------------------------

482
pub struct WorkerMetricsPublisher {
GuanLuo's avatar
GuanLuo committed
483
484
485
486
    tx: tokio::sync::watch::Sender<Arc<ForwardPassMetrics>>,
    rx: tokio::sync::watch::Receiver<Arc<ForwardPassMetrics>>,
}

487
impl WorkerMetricsPublisher {
GuanLuo's avatar
GuanLuo committed
488
489
    pub fn new() -> Result<Self> {
        let (tx, rx) = tokio::sync::watch::channel(Arc::new(ForwardPassMetrics::default()));
490
        Ok(WorkerMetricsPublisher { tx, rx })
GuanLuo's avatar
GuanLuo committed
491
492
493
494
495
496
    }

    pub fn publish(
        &self,
        metrics: Arc<ForwardPassMetrics>,
    ) -> Result<(), tokio::sync::watch::error::SendError<Arc<ForwardPassMetrics>>> {
497
        tracing::trace!("Publish metrics: {metrics:?}");
GuanLuo's avatar
GuanLuo committed
498
499
500
        self.tx.send(metrics)
    }

501
    pub async fn create_endpoint(&self, component: Component) -> Result<()> {
GuanLuo's avatar
GuanLuo committed
502
        let mut metrics_rx = self.rx.clone();
503
        let handler = Arc::new(KvLoadEndpointHandler::new(metrics_rx.clone()));
504
505
        let handler = Ingress::for_engine(handler)?;

506
507
508
509
510
511
512
513
514
515
        let worker_id = component
            .drt()
            .primary_lease()
            .map(|lease| lease.id())
            .unwrap_or_else(|| {
                tracing::warn!("Component is static, assuming worker_id of 0");
                0
            });

        self.start_nats_metrics_publishing(component.namespace().clone(), worker_id);
516

517
        component
518
            .endpoint(KV_METRICS_ENDPOINT)
519
520
            .endpoint_builder()
            .stats_handler(move |_| {
GuanLuo's avatar
GuanLuo committed
521
522
                let metrics = metrics_rx.borrow_and_update().clone();
                serde_json::to_value(&*metrics).unwrap()
523
524
            })
            .handler(handler)
525
526
            .start()
            .await
527
    }
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604

    /// Starts a background task to publish metrics over NATS
    ///
    /// This task monitors metric changes (specifically kv_active_blocks and num_requests_waiting)
    /// and publishes stable metrics to NATS after they've been unchanged for 1ms.
    #[allow(dead_code)]
    fn start_nats_metrics_publishing(&self, namespace: Namespace, worker_id: i64) {
        let nats_rx = self.rx.clone();

        tokio::spawn(async move {
            let mut rx = nats_rx;
            let mut last_kv_active_blocks: Option<u64> = None;
            let mut last_num_requests_waiting: Option<u64> = None;
            let mut pending_publish: Option<Arc<ForwardPassMetrics>> = None;
            let mut publish_timer =
                Box::pin(tokio::time::sleep(tokio::time::Duration::from_secs(0)));
            publish_timer.as_mut().reset(tokio::time::Instant::now()); // Complete immediately

            loop {
                tokio::select! {
                    // Handle metrics changes
                    result = rx.changed() => {
                        if result.is_err() {
                            tracing::debug!(
                                "Metrics publisher sender dropped, stopping NATS background task"
                            );
                            break;
                        }

                        let metrics = rx.borrow_and_update().clone();

                        // Extract the values we care about
                        let current_kv_active_blocks = metrics.kv_stats.kv_active_blocks;
                        let current_num_requests_waiting =
                            metrics.worker_stats.num_requests_waiting;

                        // Check if these specific metrics have changed
                        let has_changed = match (last_kv_active_blocks, last_num_requests_waiting) {
                            (Some(last_kv), Some(last_requests)) => {
                                last_kv != current_kv_active_blocks
                                    || last_requests != current_num_requests_waiting
                            }
                            _ => true, // First time, consider it changed
                        };

                        // If load metrics changed, schedule a publish
                        if has_changed {
                            pending_publish = Some(metrics.clone());
                            last_kv_active_blocks = Some(current_kv_active_blocks);
                            last_num_requests_waiting = Some(current_num_requests_waiting);

                            // Start the 1ms timer
                            publish_timer.as_mut().reset(
                                tokio::time::Instant::now() + tokio::time::Duration::from_millis(1)
                            );
                        }
                    }
                    // Timer expired - publish if we have pending metrics
                    _ = &mut publish_timer => {
                        if let Some(metrics) = pending_publish.take() {
                            // Create LoadEvent wrapping the metrics
                            let load_event = LoadEvent {
                                worker_id,
                                data: (*metrics).clone(),
                            };

                            if let Err(e) =
                                namespace.publish(KV_METRICS_SUBJECT, &load_event).await
                            {
                                tracing::warn!("Failed to publish metrics over NATS: {}", e);
                            }
                        }
                    }
                }
            }
        });
    }
605
606
}

607
struct KvLoadEndpointHandler {
608
609
610
    metrics_rx: tokio::sync::watch::Receiver<Arc<ForwardPassMetrics>>,
}

611
impl KvLoadEndpointHandler {
612
613
614
615
616
617
618
    pub fn new(metrics_rx: tokio::sync::watch::Receiver<Arc<ForwardPassMetrics>>) -> Self {
        Self { metrics_rx }
    }
}

#[async_trait]
impl AsyncEngine<SingleIn<()>, ManyOut<Annotated<ForwardPassMetrics>>, Error>
619
    for KvLoadEndpointHandler
620
621
622
623
624
625
626
627
628
629
{
    async fn generate(
        &self,
        request: SingleIn<()>,
    ) -> Result<ManyOut<Annotated<ForwardPassMetrics>>> {
        let context = request.context();
        let metrics = self.metrics_rx.borrow().clone();
        let metrics = (*metrics).clone();
        let stream = stream::iter(vec![Annotated::from_data(metrics)]);
        Ok(ResponseStream::new(Box::pin(stream), context))
GuanLuo's avatar
GuanLuo committed
630
631
    }
}
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720

// -------------------------------------------------------------------------
// Testing -----------------------------------------------------------------
// -------------------------------------------------------------------------

#[cfg(test)]
mod test_event_processing {
    use super::*;
    use crate::kv_router::indexer::compute_block_hash_for_seq;

    // ---------------------------------------------------------------------
    // create_stored_block_from_parts --------------------------------------
    // ---------------------------------------------------------------------

    /// The stored block keeps the supplied block hash, and its tokens hash
    /// matches the hash computed directly from the same token ids.
    #[test]
    fn test_create_stored_block_from_parts() {
        let kv_block_size = 4;
        let token_ids = vec![10, 20, 30, 40];
        let blk_hash = 0xdead_beef;

        let stored = create_stored_block_from_parts(kv_block_size, blk_hash, &token_ids, 0);

        assert_eq!(stored.block_hash.0, blk_hash as u64);
        let expected_hash = compute_block_hash_for_seq(&token_ids, 4)[0];
        assert_eq!(stored.tokens_hash, expected_hash);
    }

    // ---------------------------------------------------------------------
    // create_stored_blocks -------------------------------------------------
    // ---------------------------------------------------------------------

    /// Two correctly sized blocks produce two stored blocks, in order.
    #[test]
    fn test_create_stored_blocks_ok() {
        let kv_block_size = 4;
        // two blocks, each of size 4
        let token_ids = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let num_block_tokens = vec![4_u64, 4_u64];
        let block_hashes = vec![111_i64, 222_i64];

        let blocks = create_stored_blocks(
            kv_block_size,
            &token_ids,
            &num_block_tokens,
            &block_hashes,
            /*lora_id=*/ 0,
            &Arc::new(AtomicU32::new(0)),
        );

        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].block_hash.0, 111);
        assert_eq!(blocks[1].block_hash.0, 222);
    }

    /// A block whose token count differs from `kv_block_size` stops the
    /// conversion and bumps the warning counter exactly once.
    #[test]
    fn test_create_stored_blocks_wrong_size_triggers_warning() {
        let kv_block_size = 4;
        // second block is the wrong size
        let token_ids = vec![1, 2, 3, 4, 5, 6, 7];
        let num_block_tokens = vec![4_u64, 3_u64];
        let block_hashes = vec![111_i64, 222_i64];
        let warning_count = Arc::new(AtomicU32::new(0));

        let blocks = create_stored_blocks(
            kv_block_size,
            &token_ids,
            &num_block_tokens,
            &block_hashes,
            /*lora_id=*/ 0,
            &warning_count,
        );

        // should early-exit as second has mismatch; `assert_eq!` prints the
        // actual values on failure, unlike `assert!(a == b)`.
        assert_eq!(blocks.len(), 1);
        assert_eq!(warning_count.load(Ordering::Relaxed), 1);
    }

    // ---------------------------------------------------------------------
    // convert_event --------------------------------------------------------
    // ---------------------------------------------------------------------

    /// A raw `BlockStored` event converts to `KvCacheEventData::Stored`.
    #[test]
    fn test_convert_event_block_stored() {
        let kv_block_size = 4;
        let raw_evt = RawKvEvent::BlockStored {
            block_hashes: vec![10, 11],
            parent_block_hash: Some(99),
            token_ids: vec![1, 2, 3, 4, 5, 6, 7, 8],
            block_size: 4,
            lora_id: Some(0),
        };

        let out = convert_event(raw_evt, 42, kv_block_size, &Arc::new(AtomicU32::new(0)));
        assert!(matches!(out.data, KvCacheEventData::Stored(_)));
    }

    /// A raw `BlockRemoved` event converts to `KvCacheEventData::Removed`.
    #[test]
    fn test_convert_event_block_removed() {
        let kv_block_size = 4;
        let raw_evt = RawKvEvent::BlockRemoved {
            block_hashes: vec![123, 456],
        };
        let out = convert_event(raw_evt, 7, kv_block_size, &Arc::new(AtomicU32::new(0)));

        assert!(matches!(out.data, KvCacheEventData::Removed(_)));
    }

    /// A raw `AllBlocksCleared` event converts to `KvCacheEventData::Cleared`.
    #[test]
    fn test_convert_event_all_blocks_cleared() {
        let kv_block_size = 4;
        let raw_evt = RawKvEvent::AllBlocksCleared;
        let out = convert_event(raw_evt, 1, kv_block_size, &Arc::new(AtomicU32::new(0)));
        assert!(matches!(out.data, KvCacheEventData::Cleared));
    }
}

#[cfg(test)]
mod tests_startup_helpers {
    use super::*;
    use crate::kv_router::protocols::ExternalSequenceBlockHash;
    use async_trait;
    use bytes::Bytes;
    use std::sync::{Arc, Mutex};
    use zeromq::{PubSocket, Socket, SocketSend, ZmqMessage};

    // Type alias to resolve clippy::type_complexity warning
    type PublishedEvents = Arc<Mutex<Vec<(String, Vec<u8>)>>>;

    //--------------------------------------------------------------------
    // A tiny stand-in for Component that just records every publish call
    //--------------------------------------------------------------------
    #[derive(Default)]
    struct MockComponent {
        /// `(subject, payload bytes)` for every publish call, in call order.
        published: PublishedEvents,
    }

    impl MockComponent {
        /// Returns the mock together with a shared handle to its record of
        /// published events, so a test can inspect it after the mock is moved.
        fn new() -> (Self, PublishedEvents) {
            let published = Arc::new(Mutex::new(Vec::new()));
            (
                Self {
                    published: published.clone(),
                },
                published,
            )
        }
    }

    #[async_trait::async_trait]
    impl EventPublisher for MockComponent {
        /// Serializes `event` with MessagePack and records it under `event_name`.
        async fn publish(
            &self,
            event_name: impl AsRef<str> + Send + Sync,
            event: &(impl serde::Serialize + Send + Sync),
        ) -> dynamo_runtime::Result<()> {
            let bytes = rmp_serde::to_vec(event).unwrap();
            self.published
                .lock()
                .unwrap()
                .push((event_name.as_ref().to_string(), bytes));
            Ok(())
        }

        /// Records the already-serialized `bytes` under `event_name`.
        async fn publish_bytes(
            &self,
            event_name: impl AsRef<str> + Send + Sync,
            bytes: Vec<u8>,
        ) -> dynamo_runtime::Result<()> {
            self.published
                .lock()
                .unwrap()
                .push((event_name.as_ref().to_string(), bytes));
            Ok(())
        }

        fn subject(&self) -> String {
            "mock.subject".into()
        }
    }

    //--------------------------------------------------------------------
    // Test start_event_processor
    //--------------------------------------------------------------------

    /// One queued event followed by a closed channel results in exactly one
    /// publish on `KV_EVENT_SUBJECT`, after which the processor task finishes.
    #[tokio::test]
    async fn test_start_event_processor() {
        let (component, published) = MockComponent::new();

        let event = KvCacheEvent {
            event_id: 1,
            data: KvCacheEventData::Removed(KvCacheRemoveData {
                block_hashes: vec![ExternalSequenceBlockHash(1), ExternalSequenceBlockHash(2)],
            }),
        };

        let token = CancellationToken::new();
        let (tx, rx) = mpsc::unbounded_channel::<KvCacheEvent>();
        tx.send(event).unwrap();
        // Dropping the sender closes the channel so the processor can exit on
        // its own once the queued event is drained.
        drop(tx);

        let handle = tokio::spawn(start_event_processor(component, 1, token, rx));

        tokio::time::timeout(tokio::time::Duration::from_secs(1), handle)
            .await
            .unwrap()
            .unwrap();

        let published = published.lock().unwrap();
        assert_eq!(published.len(), 1);
        let (subject, _) = &published[0];
        // Compare as `&str` instead of allocating a `String` for the constant.
        assert_eq!(subject.as_str(), KV_EVENT_SUBJECT);
    }

    //--------------------------------------------------------------------
    // Test start_zmq_listener end to end
    //   (feed it frames from a ZMQ PUB socket over loopback TCP)
    //--------------------------------------------------------------------
    #[tokio::test]
    async fn test_start_zmq_listener_pushes_to_channel() {
        // Prepare channel that listener should fill
        let (tx, mut rx) = mpsc::unbounded_channel::<KvCacheEvent>();

        let topic = "".to_string(); // subscribe to all

        // Publisher side - set up first. Binding to port 0 lets the OS choose
        // a free port, so the test cannot collide with another process or a
        // parallel test run; `bind` returns the endpoint it actually bound.
        let mut pub_socket = PubSocket::new();
        let endpoint = pub_socket
            .bind("tcp://127.0.0.1:0")
            .await
            .unwrap()
            .to_string();

        // Cancellation token so we can stop the listener
        let token = dynamo_runtime::CancellationToken::new();

        // Spawn async listener
        let listener_handle = tokio::spawn({
            let token = token.clone();
            start_zmq_listener(endpoint, topic, tx, token, 4)
        });

        // Give time for the connection to establish
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

        // Send synthetic 3-frame message: [topic, seq(8B), payload]
        let seq: u64 = 77;

        let events = vec![RawKvEvent::BlockStored {
            block_hashes: vec![42],
            parent_block_hash: None,
            token_ids: vec![0, 1, 2, 3],
            block_size: 4,
            lora_id: None,
        }];

        let batch = KvEventBatch {
            ts: 0.0,
            events,
            data_parallel_rank: 1,
        };

        let payload = Bytes::from(rmps::to_vec(&batch).unwrap());

        let frames = vec![
            Bytes::from(""),
            Bytes::from(seq.to_be_bytes().to_vec()),
            payload,
        ];

        // Create a proper multipart message
        let msg = ZmqMessage::try_from(frames).expect("Failed to create ZmqMessage");

        // Send the multipart message
        pub_socket.send(msg).await.unwrap();

        // Wait (bounded) for the event instead of sleeping a fixed interval and
        // polling once, which fails spuriously on a slow machine.
        let event = tokio::time::timeout(tokio::time::Duration::from_secs(1), rx.recv())
            .await
            .expect("timed out waiting for the listener to forward an event")
            .expect("listener channel closed before delivering an event");

        let KvCacheEventData::Stored(KvCacheStoreData {
            parent_hash,
            blocks,
        }) = event.data
        else {
            panic!("expected KvCacheStoreData");
        };

        assert!(parent_hash.is_none());
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].block_hash.0, 42);

        // Stop the listener
        token.cancel();
        let _ = listener_handle.await;
    }
}

#[cfg(test)]
mod test_exponential_backoff {
    use super::*;

    /// The delay doubles at every step: 10 * 2^n milliseconds for n in 0..=8.
    #[test]
    fn test_backoff_calculation_progression() {
        // (input, expected delay in ms)
        let expected_delays = [
            (0, 10),
            (1, 20),
            (2, 40),
            (3, 80),
            (4, 160),
            (5, 320),
            (6, 640),
            (7, 1280),
            (8, 2560),
        ];

        for &(step, delay_ms) in &expected_delays {
            assert_eq!(calculate_backoff_ms(step), delay_ms);
        }
    }

    /// Inputs beyond 8 yield the same delay as 8 (2560 ms).
    #[test]
    fn test_backoff_caps_at_max_exponent() {
        let capped = calculate_backoff_ms(8);
        assert_eq!(capped, 2560);

        // Anything larger must not grow past the capped value.
        assert_eq!(calculate_backoff_ms(9), capped);
        assert_eq!(calculate_backoff_ms(100), capped);
    }

    /// No input in 0..20 produces a delay above `MAX_BACKOFF_MS`.
    #[test]
    fn test_backoff_never_exceeds_max() {
        assert!((0..20).all(|step| calculate_backoff_ms(step) <= MAX_BACKOFF_MS));
    }

    /// The backoff constants are mutually consistent.
    #[test]
    // Every operand below is a constant, which is exactly what this test is
    // meant to pin down, so the lint is silenced on purpose.
    #[allow(clippy::assertions_on_constants)]
    fn test_backoff_constants_are_sane() {
        // Largest delay the formula can produce from the constants alone.
        let largest_delay = INITIAL_BACKOFF_MS * 2_u64.pow(MAX_BACKOFF_EXPONENT);

        assert!(INITIAL_BACKOFF_MS > 0);
        assert!(MAX_BACKOFF_MS > INITIAL_BACKOFF_MS);
        assert!(MAX_BACKOFF_EXPONENT <= 10); // keep the exponent small
        assert!(MAX_CONSECUTIVE_ERRORS > 0);
        assert!(largest_delay <= MAX_BACKOFF_MS);
    }
}

#[cfg(test)]
mod test_worker_metrics_publisher {
    use super::*;
    use crate::kv_router::protocols::{ForwardPassMetrics, KvStats, WorkerStats};
    use dynamo_runtime::traits::events::EventSubscriber; // trait import; presumably provides `subscribe_with_type` used below
    use dynamo_runtime::{DistributedRuntime, Runtime};
    use futures::StreamExt;

    // End-to-end check of `WorkerMetricsPublisher::start_nats_metrics_publishing`:
    //   1. a burst of metrics with changing load values yields a single event
    //      carrying the last values;
    //   2. a burst where only the non-load fields change yields no event.
    #[tokio::test]
    #[ignore] // skipped by default. NOTE(review): appears to need a reachable runtime/NATS backend — confirm
    async fn test_metrics_publishing_behavior() -> Result<()> {
        // Set up runtime and namespace
        let rt = Runtime::from_current().unwrap();
        let drt = DistributedRuntime::from_settings(rt.clone()).await?;
        let namespace = drt.namespace("test".to_string())?;

        // Create a subscriber for the metrics events using subscribe_with_type
        let mut subscriber = namespace
            .subscribe_with_type::<LoadEvent>(KV_METRICS_SUBJECT)
            .await
            .unwrap();

        // Create WorkerMetricsPublisher
        let publisher = WorkerMetricsPublisher::new().unwrap();
        let worker_id = 1234;

        // Start NATS metrics publishing
        publisher.start_nats_metrics_publishing(namespace.clone(), worker_id);

        // Allow some time for the background task to start
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;

        // Test 1: Publish 10 different metrics at 100µs intervals.
        // Only the last one is expected to be published.
        // NOTE(review): assumes the publisher waits ~1ms for the value to settle — confirm
        for i in 0..10 {
            let metrics = Arc::new(ForwardPassMetrics {
                kv_stats: KvStats {
                    kv_active_blocks: (i * 100) as u64, // Changing load metric
                    kv_total_blocks: 1000,
                    gpu_cache_usage_perc: 0.5,
                    gpu_prefix_cache_hit_rate: 0.8,
                },
                worker_stats: WorkerStats {
                    num_requests_waiting: (i * 10) as u64, // Changing load metric
                    data_parallel_rank: None,
                    request_active_slots: 50,
                    request_total_slots: 100,
                },
                spec_decode_stats: None,
            });

            publisher.publish(metrics).unwrap();
            tokio::time::sleep(tokio::time::Duration::from_micros(100)).await;
        }

        // Wait 10ms after the burst so the last metric has time to be published
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;

        // Verify we receive exactly one event with the last metric values
        let result =
            tokio::time::timeout(tokio::time::Duration::from_millis(500), subscriber.next())
                .await
                .unwrap();

        let event = result.unwrap().unwrap(); // Unwrap the Option and the Result
        assert_eq!(event.worker_id, worker_id);
        assert_eq!(event.data.kv_stats.kv_active_blocks, 900); // Last value: 9 * 100
        assert_eq!(event.data.worker_stats.num_requests_waiting, 90); // Last value: 9 * 10

        // Ensure no more events are waiting: the 50ms timeout must elapse
        let no_msg =
            tokio::time::timeout(tokio::time::Duration::from_millis(50), subscriber.next()).await;
        assert!(no_msg.is_err(), "Expected no more messages, but found one");

        // Test 2: Publish 10 more metrics where everything changes EXCEPT the load metrics
        for i in 0..10 {
            let metrics = Arc::new(ForwardPassMetrics {
                kv_stats: KvStats {
                    kv_active_blocks: 900,                         // Keep same as last published
                    kv_total_blocks: 1000 + (i * 100) as u64,      // Change other metrics
                    gpu_cache_usage_perc: 0.3 + (i as f32 * 0.05), // Change other metrics
                    gpu_prefix_cache_hit_rate: 0.7 + (i as f32 * 0.01), // Change other metrics
                },
                worker_stats: WorkerStats {
                    num_requests_waiting: 90, // Keep same as last published
                    data_parallel_rank: None,
                    request_active_slots: 40 + (i * 5) as u64, // Change other metrics
                    request_total_slots: 100 + (i * 10) as u64, // Change other metrics
                },
                spec_decode_stats: None,
            });

            publisher.publish(metrics).unwrap();
            tokio::time::sleep(tokio::time::Duration::from_micros(100)).await;
        }

        // Wait 10ms so any event triggered by the burst would have been sent
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;

        // Verify no events are received: the 50ms timeout must elapse
        let no_msg =
            tokio::time::timeout(tokio::time::Duration::from_millis(50), subscriber.next()).await;
        assert!(
            no_msg.is_err(),
            "Expected no messages when load metrics don't change"
        );

        rt.shutdown();

        Ok(())
    }
}