protocols.rs

// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::future::Future;
use std::ops::Range;
use std::time::Duration;

use dynamo_tokens::{SequenceHash, Token};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use xxhash_rust::xxh3;

const fn default_track_prefill_tokens() -> bool {
    true
}

/// The event subject that workers publish KV cache events on.
pub const KV_EVENT_SUBJECT: &str = "kv-events";

/// Seed for XXH3 hashing, consistent with indexer.rs
pub const XXH3_SEED: u64 = 1337;

/// Compute hash of data using XXH3 with the standard seed.
pub fn compute_hash(data: &[u8]) -> u64 {
    xxh3::xxh3_64_with_seed(data, XXH3_SEED)
}

/// Compute the hash of a local block.
pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash {
    LocalBlockHash(compute_hash(data))
}

#[derive(Debug, Clone, Copy, Default)]
pub struct BlockHashOptions<'a> {
    pub block_mm_infos: Option<&'a [Option<BlockExtraInfo>]>,
    pub lora_name: Option<&'a str>,
    pub is_eagle: Option<bool>,
}

#[inline]
fn hash_block_no_mm(chunk: &[u32], seed: u64, scratch_bytes: &mut Vec<u8>) -> LocalBlockHash {
    #[cfg(target_endian = "little")]
    {
        let _ = scratch_bytes;
        // SAFETY: `u32` is plain-old-data, and on little-endian targets its in-memory
        // representation matches the `to_le_bytes()` sequence used for hashing.
        let chunk_bytes = unsafe {
            std::slice::from_raw_parts(chunk.as_ptr().cast::<u8>(), std::mem::size_of_val(chunk))
        };
        LocalBlockHash(xxh3::xxh3_64_with_seed(chunk_bytes, seed))
    }

    #[cfg(not(target_endian = "little"))]
    {
        scratch_bytes.clear();
        for &token in chunk {
            scratch_bytes.extend_from_slice(&token.to_le_bytes());
        }
        LocalBlockHash(xxh3::xxh3_64_with_seed(scratch_bytes, seed))
    }
}

/// Compute the hash for a sequence of tokens, optionally including multimodal metadata
/// and LoRA adapter identity.
///
/// When multimodal extra info is provided, the mm_hashes are included in the hash computation
/// to ensure that blocks with identical tokens but different multimodal objects produce
/// different hashes.
///
/// When `lora_name` is provided, the adapter name is mixed into the XXH3 seed so that
/// blocks cached under different LoRA adapters (or the base model) produce distinct hashes.
/// Because LoRA identity applies uniformly to every block in a sequence, encoding it in the
/// seed is more efficient than appending per-block bytes and matches the approach used by
/// KVBM's `SaltHash`.
pub fn compute_block_hash_for_seq(
    tokens: &[u32],
    kv_block_size: u32,
    options: BlockHashOptions<'_>,
) -> Vec<LocalBlockHash> {
    if kv_block_size == 0 {
        return Vec::new();
    }

    let seed = match options.lora_name.filter(|n| !n.is_empty()) {
        Some(name) => XXH3_SEED.wrapping_add(xxh3::xxh3_64(name.as_bytes())),
        None => XXH3_SEED,
    };
    let is_eagle_flag = options.is_eagle.unwrap_or(false);
    let stride = kv_block_size as usize;
    let window_size = if is_eagle_flag { stride + 1 } else { stride };
    let estimated_blocks = if is_eagle_flag {
        tokens.len().saturating_sub(1) / stride
    } else {
        tokens.len() / stride
    };
    let mut hashes = Vec::with_capacity(estimated_blocks);
    let mut bytes = Vec::with_capacity(window_size * std::mem::size_of::<u32>());
    let mut mm_hashes = Vec::new();
    let mut block_idx = 0;
    let mut start = 0;

    while start + window_size <= tokens.len() {
        let chunk = &tokens[start..start + window_size];
        if let Some(mm_infos) = options.block_mm_infos
            && let Some(Some(block_mm_info)) = mm_infos.get(block_idx)
        {
            bytes.clear();
            for &token in chunk {
                bytes.extend_from_slice(&token.to_le_bytes());
            }

            mm_hashes.clear();
            mm_hashes.extend(block_mm_info.mm_objects.iter().map(|obj| obj.mm_hash));
            mm_hashes.sort_unstable();

            for &mm_hash in &mm_hashes {
                bytes.extend_from_slice(&mm_hash.to_le_bytes());
            }

            hashes.push(LocalBlockHash(xxh3::xxh3_64_with_seed(&bytes, seed)));
        } else {
            hashes.push(hash_block_no_mm(chunk, seed, &mut bytes));
        }

        start += stride;
        block_idx += 1;
    }

    hashes
}

/// Compute rolling sequence hashes for a vector of block hashes.
///
/// - The first block's sequence hash equals its block hash
/// - Subsequent blocks' sequence hash = hash([parent_sequence_hash, current_block_hash], seed)
pub fn compute_seq_hash_for_block(block_hashes: &[LocalBlockHash]) -> Vec<SequenceHash> {
    if block_hashes.is_empty() {
        return Vec::new();
    }

    let mut sequence_hashes = Vec::with_capacity(block_hashes.len());
    sequence_hashes.push(block_hashes[0].0);

    for i in 1..block_hashes.len() {
        let parent_seq_hash = sequence_hashes[i - 1];
        let current_block_hash = block_hashes[i].0;

        let combined = [parent_seq_hash, current_block_hash];
        #[cfg(target_endian = "little")]
        let seq_hash = {
            // SAFETY: `u64` is plain-old-data, and on little-endian targets its in-memory
            // representation matches the `to_le_bytes()` sequence used by the previous code.
            let bytes = unsafe {
                std::slice::from_raw_parts(
                    combined.as_ptr().cast::<u8>(),
                    std::mem::size_of_val(&combined),
                )
            };
            compute_hash(bytes)
        };
        #[cfg(not(target_endian = "little"))]
        let seq_hash = {
            let mut bytes = [0_u8; std::mem::size_of::<u64>() * 2];
            bytes[..8].copy_from_slice(&parent_seq_hash.to_le_bytes());
            bytes[8..].copy_from_slice(&current_block_hash.to_le_bytes());
            compute_hash(&bytes)
        };
        sequence_hashes.push(seq_hash);
    }

    sequence_hashes
}

/// Trait abstracting the worker configuration fields needed by the scheduling layer.
///
/// `ModelRuntimeConfig` (in `lib/llm`) implements this directly so no adapter type is needed.
pub trait WorkerConfigLike {
    fn data_parallel_start_rank(&self) -> u32;
    fn data_parallel_size(&self) -> u32;
    fn max_num_batched_tokens(&self) -> Option<u64>;
    fn total_kv_blocks(&self) -> Option<u64>;
}

/// Transport abstraction for publishing batched router-visible KV cache events.
pub trait RouterEventSink: Send + Sync {
    fn publish_event(&self, event: &RouterEvent)
    -> impl Future<Output = anyhow::Result<()>> + Send;
}

/// A worker identifier.
pub type WorkerId = u64;

/// A data parallel rank identifier.
pub type DpRank = u32;

/// A worker identifier combined with its data parallel rank.
/// Used for routing decisions in data parallel setups.
/// dp_rank = 0 indicates either DP not enabled or the first rank.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct WorkerWithDpRank {
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
}

impl WorkerWithDpRank {
    pub fn new(worker_id: WorkerId, dp_rank: DpRank) -> Self {
        Self { worker_id, dp_rank }
    }

    pub fn from_worker_id(worker_id: WorkerId) -> Self {
        Self {
            worker_id,
            dp_rank: 0,
        }
    }
}

#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum StorageTier {
    #[default]
    Device,
    HostPinned,
    Disk,
    External,
}

impl StorageTier {
    pub fn from_kv_medium(medium: &str) -> Option<Self> {
        match medium {
            "GPU" | "DEVICE" => Some(Self::Device),
            "CPU_PINNED" | "CPU_TIER1" => Some(Self::HostPinned),
            "CPU_TIER2" | "DISK" | "NVME" => Some(Self::Disk),
            "EXTERNAL" | "NETWORK" | "REMOTE" | "SHARED" => Some(Self::External),
            _ => None,
        }
    }

    pub fn from_kv_medium_or_default(medium: Option<&str>) -> Self {
        medium
            .and_then(Self::from_kv_medium)
            .unwrap_or(Self::Device)
    }

    pub fn is_gpu(self) -> bool {
        matches!(self, Self::Device)
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum PlacementOwner {
    LocalWorker(WorkerWithDpRank),
    Shared,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct Placement {
    pub owner: PlacementOwner,
    pub tier: StorageTier,
}

impl Placement {
    pub fn local_worker(worker_id: WorkerId, dp_rank: DpRank, tier: StorageTier) -> Self {
        Self {
            owner: PlacementOwner::LocalWorker(WorkerWithDpRank::new(worker_id, dp_rank)),
            tier,
        }
    }

    pub fn local_gpu(worker_id: WorkerId, dp_rank: DpRank) -> Self {
        Self::local_worker(worker_id, dp_rank, StorageTier::Device)
    }

    pub fn is_local_gpu(&self) -> bool {
        matches!(self.owner, PlacementOwner::LocalWorker(_)) && self.tier.is_gpu()
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PlacementEvent {
    pub placement: Placement,
    pub event: KvCacheEvent,
}

impl PlacementEvent {
    pub fn new(placement: Placement, event: KvCacheEvent) -> Self {
        Self { placement, event }
    }

    pub fn local_gpu(worker_id: WorkerId, event: KvCacheEvent) -> Self {
        Self::new(Placement::local_gpu(worker_id, event.dp_rank), event)
    }

    pub fn into_router_event(self) -> Option<RouterEvent> {
        let PlacementOwner::LocalWorker(worker) = self.placement.owner else {
            return None;
        };
        Some(RouterEvent::with_storage_tier(
            worker.worker_id,
            self.event,
            self.placement.tier,
        ))
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "method", rename_all = "snake_case")]
pub enum RouterRequest {
    #[serde(rename = "new")]
    New {
        tokens: Vec<Token>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        block_mm_infos: Option<Vec<Option<BlockExtraInfo>>>,
    },
    MarkPrefill,
    MarkFree {
        // once request is cancelled, the frontend might not be allowed to send a
        // request with linking the id. In this case, the request_id is provided in the payload.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        request_id: Option<String>,
    },
}

impl Default for RouterRequest {
    fn default() -> Self {
        RouterRequest::New {
            tokens: vec![],
            block_mm_infos: None,
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "method", rename_all = "snake_case")]
pub enum RouterResponse {
    New {
        worker_id: WorkerId,
        #[serde(default)]
        dp_rank: DpRank,
        overlap_blocks: u32,
    },
    PrefillMarked {
        success: bool,
    },
    FreeMarked {
        success: bool,
    },
}

#[derive(Debug)]
pub struct WorkerSelectionResult {
    /// The full worker information including dp_rank
    pub worker: WorkerWithDpRank,

    /// The total number of blocks required to prefill the request
    pub required_blocks: u64,

    /// The number of blocks that the selected worker may already have cached.
    /// This is not a guarantee, but an estimate.
    pub overlap_blocks: u32,
}

/// Active load metrics for a worker, used for busy detection.
///
/// Published by workers (with `kv_used_blocks`) and by the scheduler (with
/// `active_decode_blocks` and `active_prefill_tokens`).
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct ActiveLoad {
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
    /// Scheduler-reported decode block load.
    pub active_decode_blocks: Option<u64>,
    /// Number of active prefill tokens (from scheduler's view).
    pub active_prefill_tokens: Option<u64>,
    /// Total KV blocks currently in use on the worker.
    ///
    /// This is published by workers only and is the authoritative signal for
    /// backend KV occupancy used by busy detection.
    #[serde(default)]
    pub kv_used_blocks: Option<u64>,
}

/// A [`LocalBlockHash`] is a hash computed from the token IDs, optional multimodal metadata,
/// and optional LoRA adapter name of a block.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct LocalBlockHash(pub u64);

/// A sequence-aware hash of a block computed by the engine from token IDs, optional metadata,
/// and the hash of the parent block.
///
/// In this case, the hashing function is external and unknown.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct ExternalSequenceBlockHash(pub u64);

// Implement From trait for convenient conversion
impl From<u64> for ExternalSequenceBlockHash {
    fn from(value: u64) -> Self {
        Self(value)
    }
}

impl From<i64> for ExternalSequenceBlockHash {
    /// Bitwise reinterpretation: preserves all bits, including negatives.
    /// This is lossless, but negative i64 values will appear as large u64 values.
    fn from(value: i64) -> Self {
        Self(value as u64)
    }
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct PrefillEvent {
    pub request_id: String,
    pub worker_id: WorkerId,
    pub data: PrefillEventData,
    pub router_id: u64,
}

/// Represents the different stages of prefilling tokens for a request.
///
/// Each variant contains a `usize` representing the number of tokens
/// that are pending prefill in the request.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum PrefillEventData {
    NewPrefill(usize),
    UpdatePrefill(usize),
    CompletePrefill,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ActiveSequenceEvent {
    pub request_id: String,
    pub worker: WorkerWithDpRank,
    pub data: ActiveSequenceEventData,
    pub router_id: u64,
    #[serde(default)]
    pub lora_name: Option<String>,
}

#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
pub struct PrefillLoadHint {
    pub initial_effective_prefill_tokens: usize,
    pub expected_prefill_duration: Option<Duration>,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum ActiveSequenceEventData {
    AddRequest {
        token_sequence: Option<Vec<SequenceHash>>,
        #[serde(default = "default_track_prefill_tokens")]
        track_prefill_tokens: bool,
        expected_output_tokens: Option<u32>,
        #[serde(default)]
        prefill_load_hint: Option<PrefillLoadHint>,
    },
    Free,
    MarkPrefillCompleted,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ActiveBlockEvent {
    pub request_id: String,
    pub data: ActiveBlockEventData,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum ActiveBlockEventData {
    NewBlock(Vec<SequenceHash>),
    FreeBlock,
}

/// Represents a collection of cache events and a shutdown flag.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct KvCacheEvents {
    /// A list of cache events.
    pub events: Vec<KvCacheEvent>,
    /// A flag indicating whether the cache is shutting down.
    pub shutdown: bool,
}

/// Represents a single cache event with an ID and associated data.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct KvCacheEvent {
    /// The unique identifier of the event.
    pub event_id: u64,
    /// The data associated with the event.
    pub data: KvCacheEventData,
    /// The data parallel rank of the worker emitting this event (0 if DP not enabled).
    #[serde(default)]
    pub dp_rank: DpRank,
}

/// Represents the data associated with a cache event.
///
/// Data is either stored or removed.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum KvCacheEventData {
    Stored(KvCacheStoreData),
    Removed(KvCacheRemoveData),
    Cleared,
}

/// Represents the data associated with a stored cache event.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct KvCacheStoreData {
    /// The optional hash of the parent block.
    pub parent_hash: Option<ExternalSequenceBlockHash>,
    /// Absolute position of the first block in this batch for positional replay.
    #[serde(default)]
    pub start_position: Option<u32>,
    /// A list of stored blocked data.
    pub blocks: Vec<KvCacheStoredBlockData>,
}

/// Multimodal object information within a block.
/// Offsets are relative to the block (0 to block_size-1).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct BlockMmObjectInfo {
    /// Hash identifying this multimodal object
    pub mm_hash: u64,
    /// Token offset ranges where this MM object's placeholders appear within THIS block
    /// Each tuple is (start_offset, end_offset) relative to block start
    pub offsets: Vec<(usize, usize)>,
}

/// Extra metadata for a block containing multimodal objects
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct BlockExtraInfo {
    /// All multimodal objects referenced in this block
    pub mm_objects: Vec<BlockMmObjectInfo>,
}

/// Request-level multimodal object information.
/// Offsets are relative to the entire request token sequence.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct RequestMmObjectInfo {
    /// Hash identifying this multimodal object
    pub mm_hash: u64,
    /// Token offset ranges where this MM object's placeholders appear in the ENTIRE request
    /// Each tuple is (start_offset, end_offset) relative to request start
    pub offsets: Vec<(usize, usize)>,
}

/// Request-level multimodal metadata
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct RequestExtraInfo {
    /// All multimodal objects in this request
    pub mm_objects: Vec<RequestMmObjectInfo>,
}

impl RequestExtraInfo {
    /// Convert request-level MM info to block-level MM info for a sequence of blocks.
    ///
    /// This function splits request-level offsets (relative to the entire request token sequence)
    /// into block-level offsets (relative to each block).
    ///
    /// # Arguments
    /// * `block_size` - The size of each block in tokens
    /// * `total_tokens` - Total number of tokens in the request
    ///
    /// # Returns
    /// A vector of `Option<BlockExtraInfo>` where each element corresponds to a block.
    /// `None` indicates a block with no multimodal objects.
    pub fn to_block_level(
        &self,
        block_size: usize,
        total_tokens: usize,
    ) -> Vec<Option<BlockExtraInfo>> {
        let num_blocks = total_tokens.div_ceil(block_size);
        let mut block_infos: Vec<Option<BlockExtraInfo>> = vec![None; num_blocks];

        for req_mm_obj in &self.mm_objects {
            for (req_start, req_end) in &req_mm_obj.offsets {
                // Find which blocks this offset range spans
                let start_block = req_start / block_size;
                let end_block = (req_end.saturating_sub(1)) / block_size;

                let upper_bound = end_block.min(num_blocks - 1) + 1;
                for (block_idx, block_info_opt) in block_infos
                    .iter_mut()
                    .enumerate()
                    .take(upper_bound)
                    .skip(start_block)
                {
                    let block_start_global = block_idx * block_size;
                    let block_end_global = ((block_idx + 1) * block_size).min(total_tokens);

                    // Calculate the intersection of this MM object's range with this block
                    let local_start = (*req_start).max(block_start_global) - block_start_global;
                    let local_end = (*req_end).min(block_end_global) - block_start_global;

                    if local_start < local_end {
                        let block_info = block_info_opt
                            .get_or_insert_with(|| BlockExtraInfo { mm_objects: vec![] });

                        // Check if we already have this mm_hash in this block
                        if let Some(existing) = block_info
                            .mm_objects
                            .iter_mut()
                            .find(|obj| obj.mm_hash == req_mm_obj.mm_hash)
                        {
                            // Add the offset range to existing object
                            existing.offsets.push((local_start, local_end));
                        } else {
                            // Create new MM object entry for this block
                            block_info.mm_objects.push(BlockMmObjectInfo {
                                mm_hash: req_mm_obj.mm_hash,
                                offsets: vec![(local_start, local_end)],
                            });
                        }
                    }
                }
            }
        }

        block_infos
    }
}

/// Represents data for a stored block.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct KvCacheStoredBlockData {
    /// The hash of the block.
    pub block_hash: ExternalSequenceBlockHash,
    /// The hash of the tokens in the block.
    pub tokens_hash: LocalBlockHash,
    /// Extra multimodal metadata for this block
    /// Note: Do NOT use skip_serializing_if with bincode - it breaks deserialization
    /// because bincode is positional and expects all fields to be present.
    #[serde(default)]
    pub mm_extra_info: Option<BlockExtraInfo>,
}

/// Represents the data associated with a removed cache event.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct KvCacheRemoveData {
    /// A list of block hashes to remove.
    pub block_hashes: Vec<ExternalSequenceBlockHash>,
}

impl Serialize for LocalBlockHash {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_u64(self.0)
    }
}

impl<'de> Deserialize<'de> for LocalBlockHash {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let value = u64::deserialize(deserializer)?;
        Ok(LocalBlockHash(value))
    }
}

impl Serialize for ExternalSequenceBlockHash {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_u64(self.0)
    }
}

impl<'de> Deserialize<'de> for ExternalSequenceBlockHash {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let value = u64::deserialize(deserializer)?;
        Ok(ExternalSequenceBlockHash(value))
    }
}

// ------
// Router Event Types
// ------

/// Errors that can occur during KV Cache Event processing.
#[derive(Debug, thiserror::Error)]
pub enum KvCacheEventError {
    #[error("Failed to find parent block")]
    ParentBlockNotFound,

    #[error("Failed to find block")]
    BlockNotFound,

    #[error("Invalid block sequence")]
    InvalidBlockSequence,
}

/// A [`KvCacheEvent`] on a specific LLM worker denoted by [`WorkerId`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct RouterEvent {
    /// The ID of the worker emitting the event.
    pub worker_id: WorkerId,
    /// The storage tier associated with the event.
    #[serde(default)]
    pub storage_tier: StorageTier,
    /// The cache event associated with the worker.
    pub event: KvCacheEvent,
}

impl RouterEvent {
    /// Create a new `RouterEvent`.
    ///
    /// ### Arguments
    ///
    /// * `worker_id` - The ID of the worker emitting the event.
    /// * `event` - The cache event.
    ///
    /// ### Returns
    ///
    /// A new `RouterEvent`.
    pub fn new(worker_id: WorkerId, event: KvCacheEvent) -> Self {
        Self::with_storage_tier(worker_id, event, StorageTier::Device)
    }

    pub fn with_storage_tier(
        worker_id: WorkerId,
        event: KvCacheEvent,
        storage_tier: StorageTier,
    ) -> Self {
        Self {
            worker_id,
            storage_tier,
            event,
        }
    }
}

/// Shared cache hit information, represented as sorted non-overlapping half-open ranges.
///
/// Ranges encode which block positions exist in the external shared KV cache pool.
/// Using ranges instead of `Vec<bool>` avoids iterating over potentially thousands
/// of blocks per worker. Typical shared cache patterns produce few contiguous regions,
/// making `hits_beyond` O(num_ranges) ~ O(1-5).
#[derive(Debug, Clone, Default)]
pub struct SharedCacheHits {
    /// Ranges of block positions that exist in the shared cache.
    /// Half-open ranges [start, end), sorted and non-overlapping.
    pub ranges: Vec<Range<u32>>,
    /// Total number of hits (sum of range lengths).
    pub total_hits: u32,
}

impl SharedCacheHits {
    /// Create from sorted, non-overlapping ranges.
    pub fn from_ranges(ranges: Vec<Range<u32>>) -> Self {
        let total_hits = ranges.iter().map(|r| r.end - r.start).sum();
        Self { ranges, total_hits }
    }

    /// Create from a boolean hit vector (convenience for tests and simple backends).
    /// Coalesces consecutive `true` entries into ranges.
    pub fn from_hits(hits: &[bool]) -> Self {
        let mut ranges = Vec::new();
        let mut i = 0;
        while i < hits.len() {
            if hits[i] {
                let start = i as u32;
                while i < hits.len() && hits[i] {
                    i += 1;
                }
                ranges.push(start..i as u32);
            } else {
                i += 1;
            }
        }
        Self::from_ranges(ranges)
    }

    /// Count hits at positions >= `from_position`.
    /// O(num_ranges), not O(num_blocks).
    pub fn hits_beyond(&self, from_position: u32) -> u32 {
        self.ranges
            .iter()
            .map(|r| {
                if r.end <= from_position {
                    0
                } else if r.start >= from_position {
                    r.end - r.start
                } else {
                    r.end - from_position
                }
            })
            .sum()
    }
}

/// Scores representing the overlap of workers (with their dp_rank).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverlapScores {
    /// Map of worker (with dp_rank) to score.
    pub scores: FxHashMap<WorkerWithDpRank, u32>,
    /// List of frequencies that the blocks have been accessed. Entries with value 0 are omitted.
    pub frequencies: Vec<usize>,
    /// Map of worker to their tree size (number of blocks in the tree for that worker).
    pub tree_sizes: FxHashMap<WorkerWithDpRank, usize>,
}

impl Default for OverlapScores {
    fn default() -> Self {
        Self::new()
    }
}

impl OverlapScores {
    /// Create a new `OverlapScores`.
    ///
    /// ### Returns
    ///
    /// A new `OverlapScores`.
    pub fn new() -> Self {
        Self {
            scores: FxHashMap::default(),
            frequencies: Vec::with_capacity(32),
            tree_sizes: FxHashMap::default(),
        }
    }

    /// Update the scores with a set of workers.
    ///
    /// ### Arguments
    ///
    /// * `workers` - An iterator over `WorkerWithDpRank` references.
    pub fn update_scores<'a, I>(&mut self, workers: I)
    where
        I: IntoIterator<Item = &'a WorkerWithDpRank>,
    {
        for worker in workers {
            let score = self.scores.entry(*worker).or_insert(0);
            *score += 1;
        }
    }

    /// Add an entry in the frequency list.
    pub fn add_frequency(&mut self, frequency: usize) {
        if frequency != 0 {
            self.frequencies
                .last()
                .inspect(|elem| debug_assert!(**elem >= frequency));
            self.frequencies.push(frequency);
        }
    }
}

// ------
// TokensWithHashes
// ------

/// A container for tokens with lazily computed block and sequence hashes.
///
/// This struct avoids redundant hash computations by caching results:
/// - `get_or_compute_block_hashes()` computes block hashes if not cached
/// - `get_or_compute_seq_hashes()` computes seq hashes if not cached,
///   and will also compute block hashes first if needed (since seq hashes depend on them)
#[derive(Debug, Clone)]
pub struct TokensWithHashes {
    tokens: Vec<u32>,
    block_size: u32,
    block_mm_infos: Option<Vec<Option<BlockExtraInfo>>>,
    lora_name: Option<String>,
    block_hashes: Option<Vec<LocalBlockHash>>,
    seq_hashes: Option<Vec<SequenceHash>>,
    is_eagle: Option<bool>,
}

impl TokensWithHashes {
    /// Creates a new TokensWithHashes from tokens and block size.
    pub fn new(tokens: Vec<u32>, block_size: u32) -> Self {
        Self {
            tokens,
            block_size,
            block_mm_infos: None,
            lora_name: None,
            block_hashes: None,
            seq_hashes: None,
            is_eagle: None,
        }
    }

    /// Adds multimodal extra info for blocks.
    pub fn with_mm_infos(mut self, infos: Vec<Option<BlockExtraInfo>>) -> Self {
        self.block_mm_infos = Some(infos);
        self
    }

    /// Sets the LoRA adapter name for hash computation.
    pub fn with_lora_name(mut self, name: String) -> Self {
        self.lora_name = Some(name);
        self
    }

    /// Sets Eagle hashing semantics for this token sequence.
    pub fn with_is_eagle(mut self, is_eagle: bool) -> Self {
        self.set_is_eagle(is_eagle);
        self
    }

    /// Updates Eagle hashing semantics and invalidates cached hashes when it changes.
    pub fn set_is_eagle(&mut self, is_eagle: bool) {
        let is_eagle = Some(is_eagle);
        if self.is_eagle == is_eagle {
            return;
        }

        self.is_eagle = is_eagle;
        self.block_hashes = None;
        self.seq_hashes = None;
    }

    /// Returns a reference to the tokens.
    pub fn tokens(&self) -> &[u32] {
        &self.tokens
    }

    /// Returns the number of tokens.
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// Returns true if there are no tokens.
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Returns the block size.
    pub fn block_size(&self) -> u32 {
        self.block_size
    }

    /// Returns the multimodal extra info, if set.
    pub fn block_mm_infos(&self) -> Option<&[Option<BlockExtraInfo>]> {
        self.block_mm_infos.as_deref()
    }

    /// Returns block hashes, computing them if not already cached.
    pub fn get_or_compute_block_hashes(&mut self) -> &[LocalBlockHash] {
        if self.block_hashes.is_none() {
            self.block_hashes = Some(compute_block_hash_for_seq(
                &self.tokens,
                self.block_size,
                BlockHashOptions {
                    block_mm_infos: self.block_mm_infos.as_deref(),
                    lora_name: self.lora_name.as_deref(),
                    is_eagle: self.is_eagle,
                },
            ));
        }
        self.block_hashes.as_ref().unwrap()
    }

    /// Returns sequence hashes, computing them if not already cached.
    /// This will also compute block hashes if they haven't been computed yet,
    /// since sequence hashes depend on block hashes.
    pub fn get_or_compute_seq_hashes(&mut self) -> &[SequenceHash] {
        if self.seq_hashes.is_none() {
            // Ensure block hashes are computed first
            let block_hashes = self.get_or_compute_block_hashes();
            self.seq_hashes = Some(compute_seq_hash_for_block(block_hashes));
        }
        self.seq_hashes.as_ref().unwrap()
    }

    /// Returns cached block hashes without computing. Returns None if not yet computed.
    pub fn block_hashes(&self) -> Option<&[LocalBlockHash]> {
        self.block_hashes.as_deref()
    }

    /// Returns cached seq hashes without computing. Returns None if not yet computed.
    pub fn seq_hashes(&self) -> Option<&[SequenceHash]> {
        self.seq_hashes.as_deref()
    }
}

// ------
// Tests
// ------
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;
    use serde_json;

    #[test]
    fn test_router_event_new() {
        let worker_id = 0;
        let kv_cache_event = KvCacheEvent {
            event_id: 1,
            data: KvCacheEventData::Stored(KvCacheStoreData {
                parent_hash: None,
                start_position: None,
                blocks: vec![KvCacheStoredBlockData {
                    block_hash: ExternalSequenceBlockHash(0),
                    mm_extra_info: None,
                    tokens_hash: LocalBlockHash(13226331709069118873),
                }],
            }),
            dp_rank: 0,
        };
        let router_event = RouterEvent::new(worker_id, kv_cache_event);

        assert_eq!(router_event.worker_id, worker_id);
        assert_eq!(router_event.event.event_id, 1);
        if let KvCacheEventData::Stored(store_op) = &router_event.event.data {
            assert_eq!(store_op.blocks.len(), 1);
            assert_eq!(
                store_op.blocks[0].tokens_hash,
                compute_block_hash(b"test data")
            );
            assert_eq!(store_op.blocks[0].block_hash, ExternalSequenceBlockHash(0));
        } else {
            panic!("Expected KvCacheEventData::Stored");
        }
    }

    #[test]
    fn test_overlap_scores_default() {
        let overlap_scores: OverlapScores = Default::default();
        assert!(overlap_scores.scores.is_empty());
    }

    #[rstest]
    #[case(11)]
    #[case(32)]
    #[case(64)]
    fn test_compute_block_hash_for_seq(#[case] kv_block_size: u32) {
        let sequence = (0..kv_block_size).collect::<Vec<u32>>();
        let hashes =
            compute_block_hash_for_seq(&sequence, kv_block_size, BlockHashOptions::default());
        assert_eq!(hashes.len(), 1);

        let sequence = (0..(kv_block_size + 1)).collect::<Vec<u32>>();
        let hashes =
            compute_block_hash_for_seq(&sequence, kv_block_size, BlockHashOptions::default());
        assert_eq!(hashes.len(), 1);

        let sequence = (0..(2 * kv_block_size + 1)).collect::<Vec<u32>>();
        let hashes =
            compute_block_hash_for_seq(&sequence, kv_block_size, BlockHashOptions::default());
        assert_eq!(hashes.len(), 2);
    }

    #[test]
    fn test_lora_name_produces_different_hash() {
        let tokens: Vec<u32> = (0..4).collect();
        let base = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
        let lora_a = compute_block_hash_for_seq(
            &tokens,
            4,
            BlockHashOptions {
                lora_name: Some("adapter-a"),
                ..Default::default()
            },
        );
        let lora_b = compute_block_hash_for_seq(
            &tokens,
            4,
            BlockHashOptions {
                lora_name: Some("adapter-b"),
                ..Default::default()
            },
        );

        assert_ne!(base[0], lora_a[0]);
        assert_ne!(base[0], lora_b[0]);
        assert_ne!(lora_a[0], lora_b[0]);
    }

    #[test]
    fn test_lora_name_empty_string_normalized_to_none() {
        let tokens: Vec<u32> = (0..4).collect();
        let base = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
        let empty = compute_block_hash_for_seq(
            &tokens,
            4,
            BlockHashOptions {
                lora_name: Some(""),
                ..Default::default()
            },
        );
        assert_eq!(
            base, empty,
            "empty lora_name should be treated as base model"
        );
    }

    #[test]
    fn test_tokens_with_hashes_lora() {
        let tokens: Vec<u32> = (0..8).collect();

        let mut base = TokensWithHashes::new(tokens.clone(), 4);
        let base_hashes = base.get_or_compute_block_hashes().to_vec();

        let mut with_lora =
            TokensWithHashes::new(tokens, 4).with_lora_name("my-adapter".to_string());
        let lora_hashes = with_lora.get_or_compute_block_hashes().to_vec();

        assert_eq!(base_hashes.len(), lora_hashes.len());
        for (b, l) in base_hashes.iter().zip(lora_hashes.iter()) {
            assert_ne!(b, l);
        }
    }

    #[test]
    fn test_compute_block_hash_for_seq_eagle_windows() {
        let tokens: Vec<u32> = (0..6).collect();

        let default_hashes = compute_block_hash_for_seq(&tokens, 2, BlockHashOptions::default());
        let eagle_hashes = compute_block_hash_for_seq(
            &tokens,
            2,
            BlockHashOptions {
                is_eagle: Some(true),
                ..Default::default()
            },
        );
        let expected_first = compute_block_hash_for_seq(
            &[0, 1, 2],
            2,
            BlockHashOptions {
                is_eagle: Some(true),
                ..Default::default()
            },
        );
        let expected_second = compute_block_hash_for_seq(
            &[2, 3, 4],
            2,
            BlockHashOptions {
                is_eagle: Some(true),
                ..Default::default()
            },
        );

        assert_eq!(default_hashes.len(), 3);
        assert_eq!(eagle_hashes.len(), 2);
        assert_eq!(eagle_hashes, vec![expected_first[0], expected_second[0]]);
        assert_ne!(default_hashes[0], eagle_hashes[0]);
    }

    #[test]
    fn test_tokens_with_hashes_set_is_eagle_invalidates_cache() {
        let tokens: Vec<u32> = (0..6).collect();
        let mut with_hashes = TokensWithHashes::new(tokens, 2);

        let default_hashes = with_hashes.get_or_compute_block_hashes().to_vec();
        with_hashes.set_is_eagle(true);
        let eagle_hashes = with_hashes.get_or_compute_block_hashes().to_vec();
        let expected_first = compute_block_hash_for_seq(
            &[0, 1, 2],
            2,
            BlockHashOptions {
                is_eagle: Some(true),
                ..Default::default()
            },
        );
        let expected_second = compute_block_hash_for_seq(
            &[2, 3, 4],
            2,
            BlockHashOptions {
                is_eagle: Some(true),
                ..Default::default()
            },
        );

        assert_eq!(default_hashes.len(), 3);
        assert_eq!(eagle_hashes.len(), 2);
        assert_eq!(eagle_hashes, vec![expected_first[0], expected_second[0]]);
        assert_ne!(default_hashes[0], eagle_hashes[0]);
    }

    #[test]
    fn test_local_block_hash_serialization() {
        let hash = LocalBlockHash(12345);
        let serialized = serde_json::to_string(&hash).unwrap();
        assert_eq!(serialized, "12345");

        let deserialized: LocalBlockHash = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized, hash);
    }

    #[test]
    fn test_external_sequence_block_hash_serialization() {
        let hash = ExternalSequenceBlockHash(67890);
        let serialized = serde_json::to_string(&hash).unwrap();
        assert_eq!(serialized, "67890");

        let deserialized: ExternalSequenceBlockHash = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized, hash);
    }

    #[test]
    fn test_kv_cache_events_serialization() {
        let event_data = KvCacheEventData::Stored(KvCacheStoreData {
            parent_hash: Some(ExternalSequenceBlockHash(1)),
            start_position: None,
            blocks: vec![KvCacheStoredBlockData {
                block_hash: ExternalSequenceBlockHash(2),
                tokens_hash: LocalBlockHash(3),
                mm_extra_info: None,
            }],
        });

        let event = KvCacheEvent {
            event_id: 1,
            data: event_data,
            dp_rank: 0,
        };

        let events = KvCacheEvents {
            events: vec![event],
            shutdown: false,
        };

        let serialized = serde_json::to_string(&events).unwrap();
        let deserialized: KvCacheEvents = serde_json::from_str(&serialized).unwrap();

        assert_eq!(deserialized.events.len(), 1);
        assert_eq!(deserialized.events[0].event_id, 1);
        if let KvCacheEventData::Stored(store_data) = &deserialized.events[0].data {
            assert_eq!(store_data.parent_hash.unwrap().0, 1);
            assert_eq!(store_data.blocks.len(), 1);
            assert_eq!(store_data.blocks[0].block_hash.0, 2);
            assert_eq!(store_data.blocks[0].tokens_hash.0, 3);
        } else {
            panic!("Expected KvCacheEventData::Stored variant");
        }
        assert!(!deserialized.shutdown);
    }

    #[test]
    fn test_kv_cache_remove_data_serialization() {
        let remove_data = KvCacheRemoveData {
            block_hashes: vec![ExternalSequenceBlockHash(4), ExternalSequenceBlockHash(5)],
        };

        let serialized = serde_json::to_string(&remove_data).unwrap();
        let deserialized: KvCacheRemoveData = serde_json::from_str(&serialized).unwrap();

        assert_eq!(deserialized.block_hashes.len(), 2);
        assert_eq!(deserialized.block_hashes[0].0, 4);
        assert_eq!(deserialized.block_hashes[1].0, 5);
    }

    #[test]
    fn test_router_request_mark_free_backwards_compatible_deserialization() {
        let request: RouterRequest = serde_json::from_str(r#"{"method":"mark_free"}"#).unwrap();

        assert!(matches!(
            request,
            RouterRequest::MarkFree { request_id: None }
        ));
    }

    #[test]
    fn test_shared_cache_hits_from_hits() {
        // All hits contiguous
        let hits = SharedCacheHits::from_hits(&[true, true, true, true]);
        assert_eq!(hits.ranges, vec![0..4]);
        assert_eq!(hits.total_hits, 4);

        // Sparse hits
        let hits = SharedCacheHits::from_hits(&[true, false, true, true, false, true]);
        assert_eq!(hits.ranges, vec![0..1, 2..4, 5..6]);
        assert_eq!(hits.total_hits, 4);

        // No hits
        let hits = SharedCacheHits::from_hits(&[false, false, false]);
        assert!(hits.ranges.is_empty());
        assert_eq!(hits.total_hits, 0);

        // Empty
        let hits = SharedCacheHits::from_hits(&[]);
        assert!(hits.ranges.is_empty());
        assert_eq!(hits.total_hits, 0);
    }

    #[test]
    fn test_shared_cache_hits_beyond() {
        // Shared has [A, B, C, D] => range 0..4
        #[allow(clippy::single_range_in_vec_init)]
        let hits = SharedCacheHits::from_ranges(vec![0..4]);

        // Device has overlap=2 (positions 0,1 on device) => shared_beyond should count positions 2,3
        assert_eq!(hits.hits_beyond(2), 2);

        // Device has overlap=0 => all 4 shared hits count
        assert_eq!(hits.hits_beyond(0), 4);

        // Device has overlap=4 => nothing beyond
        assert_eq!(hits.hits_beyond(4), 0);

        // Device overlap exceeds range
        assert_eq!(hits.hits_beyond(10), 0);
    }

    #[test]
    fn test_shared_cache_hits_beyond_sparse() {
        // Ranges: [1..3, 5..8] => positions 1,2,5,6,7
        let hits = SharedCacheHits::from_ranges(vec![1..3, 5..8]);
        assert_eq!(hits.total_hits, 5);

        // from_position=0 => all 5 hits
        assert_eq!(hits.hits_beyond(0), 5);
        // from_position=2 => pos 2 (from first range) + 5,6,7 (from second) = 4
        assert_eq!(hits.hits_beyond(2), 4);
        // from_position=3 => only second range: 3 hits
        assert_eq!(hits.hits_beyond(3), 3);
        // from_position=6 => positions 6,7 from second range = 2
        assert_eq!(hits.hits_beyond(6), 2);
        // from_position=8 => nothing
        assert_eq!(hits.hits_beyond(8), 0);
    }

    #[test]
    fn test_router_request_mark_free_serialization_with_request_id() {
        let request = RouterRequest::MarkFree {
            request_id: Some("req-123".to_string()),
        };

        let serialized = serde_json::to_string(&request).unwrap();
        let deserialized: RouterRequest = serde_json::from_str(&serialized).unwrap();

        assert_eq!(
            serialized,
            r#"{"method":"mark_free","request_id":"req-123"}"#
        );
        assert!(matches!(
            deserialized,
            RouterRequest::MarkFree {
                request_id: Some(ref request_id)
            } if request_id == "req-123"
        ));
    }
}