feat: add KV Event Publishing to vLLM v1 (#1181)

0df6d462 · Alec · GitHub · 93ca9df1 · 0df6d462 · 0df6d462
Unverified Commit 0df6d462 authored May 29, 2025 by Alec Committed by GitHub May 29, 2025
14 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -269,6 +269,19 @@ dependencies = [
 "zmq",
 ]

+[[package]]
+name = "asynchronous-codec"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a860072022177f903e59730004fb5dc13db9275b79bb2aef7ba8ce831956c233"
+dependencies = [
+ "bytes",
+ "futures-sink",
+ "futures-util",
+ "memchr",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "atomic"
 version = "0.6.0"
@@ -1285,6 +1298,19 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "dashmap"
+version = "5.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
+dependencies = [
+ "cfg-if 1.0.0",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
 [[package]]
 name = "data-encoding"
 version = "2.9.0"
@@ -1606,6 +1632,7 @@ dependencies = [
 "rayon",
 "regex",
 "reqwest",
+ "rmp-serde",
 "rstest 0.18.2",
 "rstest_reuse",
 "sentencepiece",
@@ -1626,6 +1653,7 @@ dependencies = [
 "uuid 1.16.0",
 "validator",
 "xxhash-rust",
+ "zeromq",
 ]

 [[package]]
@@ -2607,6 +2635,12 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"

+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
 [[package]]
 name = "hashbrown"
 version = "0.15.2"
@@ -5275,6 +5309,28 @@ dependencies = [
 "windows-sys 0.52.0",
 ]

+[[package]]
+name = "rmp"
+version = "0.8.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4"
+dependencies = [
+ "byteorder",
+ "num-traits",
+ "paste",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db"
+dependencies = [
+ "byteorder",
+ "rmp",
+ "serde",
+]
+
 [[package]]
 name = "router"
 version = "0.2.1"
@@ -6514,6 +6570,7 @@ checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df"
 dependencies = [
 "bytes",
 "futures-core",
+ "futures-io",
 "futures-sink",
 "pin-project-lite",
 "tokio",
@@ -7783,6 +7840,33 @@ version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"

+[[package]]
+name = "zeromq"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a4528179201f6eecf211961a7d3276faa61554c82651ecc66387f68fc3004bd"
+dependencies = [
+ "async-trait",
+ "asynchronous-codec",
+ "bytes",
+ "crossbeam-queue",
+ "dashmap",
+ "futures-channel",
+ "futures-io",
+ "futures-task",
+ "futures-util",
+ "log",
+ "num-traits",
+ "once_cell",
+ "parking_lot",
+ "rand 0.8.5",
+ "regex",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-util",
+ "uuid 1.16.0",
+]
+
 [[package]]
 name = "zeromq-src"
 version = "0.2.6+4.3.4"

--- a/components/metrics/src/bin/mock_worker.rs
+++ b/components/metrics/src/bin/mock_worker.rs
@@ -115,6 +115,7 @@ fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
    let gpu_cache_usage_perc = rand::rng().random_range(0.0..=1.0);
    let gpu_prefix_cache_hit_rate = rand::rng().random_range(0.0..=1.0);
    let stats = ForwardPassMetrics {
+        data_parallel_rank: None, // Default for backwards compatibility
        request_active_slots,
        request_total_slots,
        kv_active_blocks,

--- a/launch/dynamo-run/src/subprocess/vllm_v1_inc.py
+++ b/launch/dynamo-run/src/subprocess/vllm_v1_inc.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# `dynamo-run out=vllm` runs this script
+# Can also be used standalone: `python3 vllm_v1_inc.py` - lots of optional cmd line params
+
+# Setup checklist:
+# - We are in a virtualenv with vllm installed. Must be newer than v0.9.0 (currently pre-release)
+# 1f079540db5f1080a2f61a730da50d3009934c5a - this commit is working for me
+# Steps:
+# git clone https://github.com/vllm-project/vllm.git
+# cd vllm && git checkout 1f079540db5f1080a2f61a730da50d3009934c5a
+# uv pip uninstall ai-dynamo-vllm
+# VLLM_USE_PRECOMPILED=1 uv pip install --editable .
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import sys
+import uuid
+from typing import Optional
+
+import uvloop
+from vllm.config import VllmConfig
+from vllm.distributed.kv_events import KVEventsConfig
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.inputs import TokensPrompt
+from vllm.sampling_params import SamplingParams
+from vllm.usage.usage_lib import UsageContext
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.metrics.loggers import StatLoggerBase
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+
+from dynamo.llm import (
+    KvEventPublisherFromZmq,
+    KvEventPublisherFromZmqConfig,
+    KvMetricsPublisher,
+    ModelType,
+    register_llm,
+)
+from dynamo.runtime import Component, DistributedRuntime, dynamo_worker
+
+# Only used if you run it manually from the command line
+# (they are the argparse defaults for --endpoint and --model-path).
+DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
+DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
+
+# Root logging is configured at import time with a fixed DEBUG level.
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+class Config:
+    """Command line parameters or defaults"""
+
+    # These are bare annotations: no attribute exists on an instance until
+    # cmd_line_args() assigns it.
+
+    # The three dot-separated parts of the endpoint string
+    # ('dyn://namespace.component.endpoint', the 'dyn://' prefix is optional).
+    namespace: str
+    component: str
+    endpoint: str
+    # Path to a model on disk or a HuggingFace model identifier.
+    model_path: str
+    # Name to serve the model under; None when --model-name is not given.
+    model_name: Optional[str]
+    # Number of GPUs to use.
+    tensor_parallel_size: int
+    # Size of a KV cache block; only forwarded to the engine when > 0.
+    kv_block_size: int
+    # Max model context length. NOTE(review): annotated `int`, but the argparse
+    # default is None, so this is effectively Optional[int].
+    context_length: int
+    # Path to a JSON file with extra engine keyword arguments; "" when unset.
+    extra_engine_args: str
+
+
+class DynamoStatLoggerPublisher(StatLoggerBase):
+    """Adapter that exposes a KvMetricsPublisher through vLLM's StatLoggerBase interface."""
+
+    def __init__(self, component: Component, dp_rank: int) -> None:
+        """Create the metrics publisher, attach its endpoint to `component`, and remember the DP rank."""
+        publisher = KvMetricsPublisher()
+        publisher.create_endpoint(component)
+        self.inner = publisher
+        self.dp_rank = dp_rank
+
+    def record(
+        self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats]
+    ):
+        """Publish one metrics snapshot derived from `scheduler_stats`.
+
+        `iteration_stats` is accepted to satisfy the interface but is not used.
+        """
+        # request_total_slots and kv_total_blocks are properties of model + gpu
+        # we should only publish them once, not every metric update
+        # they should be part of some runtime metadata tied to MDC or put in etcd ?
+        cache_stats = scheduler_stats.prefix_cache_stats
+        # Guard against division by zero before any prefix-cache query was made.
+        hit_rate = cache_stats.hits / cache_stats.queries if cache_stats.queries > 0 else 0
+
+        # TODO Manage DP Ranks in metrics aggregation.
+        metrics = {
+            "request_active_slots": scheduler_stats.num_running_reqs,
+            "request_total_slots": 0,  # TODO - remove from metrics
+            "kv_active_blocks": 0,  # TODO - need to calculate this
+            "kv_total_blocks": 0,  # TODO - remove from metrics
+            # The next two are used in the current cost function.
+            "num_requests_waiting": scheduler_stats.num_waiting_reqs,
+            "gpu_cache_usage_perc": scheduler_stats.gpu_cache_usage,
+            "gpu_prefix_cache_hit_rate": hit_rate,
+            "data_parallel_rank": self.dp_rank,
+        }
+        self.inner.publish(**metrics)
+
+    def log_engine_initialized(self) -> None:
+        """Interface hook; intentionally does nothing."""
+        return None
+
+
+class StatLoggerFactory:
+    """Callable factory that builds a stat logger publisher per data-parallel rank. Required by vLLM."""
+
+    def __init__(self, component: Component) -> None:
+        # Every logger created by this factory publishes through the same component.
+        self.component = component
+
+    def create_stat_logger(self, dp_rank: int) -> StatLoggerBase:
+        """Build a publisher bound to this factory's component and the given rank."""
+        stat_logger = DynamoStatLoggerPublisher(self.component, dp_rank)
+        return stat_logger
+
+    def __call__(self, vllm_config: VllmConfig, dp_rank: int) -> StatLoggerBase:
+        """Factory call signature; `vllm_config` is accepted but unused."""
+        stat_logger = self.create_stat_logger(dp_rank=dp_rank)
+        return stat_logger
+
+
+class RequestHandler:
+    """
+    Request handler for the generate endpoint
+    """
+
+    def __init__(self, component, engine, default_sampling_params):
+        """Store the component, the engine client and the default sampling parameters.
+
+        `default_sampling_params` is expanded as keyword arguments into
+        `SamplingParams` for every request.
+        """
+        self.component = component
+        self.engine_client = engine
+        self.default_sampling_params = default_sampling_params
+
+    async def generate(self, request):
+        """Run one generation request and stream back token deltas.
+
+        `request` is indexed as a mapping with the keys `token_ids`,
+        `sampling_options` (a mapping) and `stop_conditions` (a mapping that
+        contains `max_tokens`).
+
+        Yields dicts with a `token_ids` list holding only the tokens produced
+        since the previous yield, plus `finish_reason` / `stop_reason` when the
+        engine reports them.
+        """
+        request_id = uuid.uuid4().hex
+
+        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
+
+        sampling_params = SamplingParams(**self.default_sampling_params)
+        for key, value in request["sampling_options"].items():
+            # Only None means "not provided". Explicit falsy values such as
+            # temperature=0 or seed=0 are meaningful and must override the
+            # defaults instead of being silently dropped.
+            if value is None:
+                continue
+            if hasattr(sampling_params, key):
+                setattr(sampling_params, key, value)
+
+        max_tokens = request["stop_conditions"]["max_tokens"]
+        if max_tokens:
+            sampling_params.max_tokens = max_tokens
+
+        num_output_tokens_so_far = 0
+        gen = self.engine_client.generate(prompt, sampling_params, request_id)
+        async for res in gen:
+            # res is vllm's RequestOutput
+
+            # This is the expected way for a request to end.
+            # The new token ID will be eos, don't forward it.
+            if res.finished:
+                yield {"finish_reason": "stop", "token_ids": []}
+                break
+
+            if not res.outputs:
+                yield {"finish_reason": "error", "token_ids": []}
+                break
+
+            output = res.outputs[0]
+            # output.token_ids is sliced from the previous length, so only
+            # the newly generated tail is forwarded.
+            next_total_toks = len(output.token_ids)
+            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
+            if output.finish_reason:
+                out["finish_reason"] = output.finish_reason
+            if output.stop_reason:
+                out["stop_reason"] = output.stop_reason
+            yield out
+            num_output_tokens_so_far = next_total_toks
+
+
+@dynamo_worker(static=False)
+async def worker(runtime: DistributedRuntime):
+    """Worker entry point: parse the command line, then hand off to `init`."""
+    config = cmd_line_args()
+    await init(runtime, config)
+
+
+async def init(runtime: DistributedRuntime, config: Config):
+    """
+    Instantiate and serve
+
+    Steps, in order: create the component service, register the model on the
+    endpoint, build the vLLM engine arguments (optionally merged with a JSON
+    file of extra arguments), create the AsyncLLM engine with the Dynamo stat
+    logger, start the ZMQ KV event publisher, and serve the generate endpoint.
+    """
+    component = runtime.namespace(config.namespace).component(config.component)
+    await component.create_service()
+
+    endpoint = component.endpoint(config.endpoint)
+    await register_llm(
+        ModelType.Backend,
+        endpoint,
+        config.model_path,
+        config.model_name,
+        kv_cache_block_size=config.kv_block_size,
+    )
+
+    arg_map = {
+        "model": config.model_path,
+        "task": "generate",
+        "tensor_parallel_size": config.tensor_parallel_size,
+        "skip_tokenizer_init": True,
+        "disable_log_requests": True,
+        "enable_prefix_caching": True,
+        # KV routing relies on logging KV metrics
+        "disable_log_stats": False,
+        "kv_events_config": KVEventsConfig(
+            enable_kv_cache_events=True, publisher="zmq"
+        ),
+    }
+
+    if config.context_length:
+        # Usually we want it to default to the max (from tokenizer_config.json)
+        arg_map["max_model_len"] = config.context_length
+
+    if config.kv_block_size > 0:
+        arg_map["block_size"] = config.kv_block_size
+
+    if config.extra_engine_args != "":
+        json_map = {}
+        # extra_engine_args is a filename
+        # Loading is best-effort: on failure the error is logged and the
+        # engine starts with the arguments built above.
+        try:
+            with open(config.extra_engine_args) as f:
+                json_map = json.load(f)
+        except FileNotFoundError:
+            logger.error(f"File {config.extra_engine_args} not found.")
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON in {config.extra_engine_args}: {e}")
+        logger.debug(f"Adding extra engine arguments: {json_map}")
+        arg_map = {**arg_map, **json_map}  # json_map gets precedence
+
+    logger.info(f"VLLM config: {arg_map}")
+
+    os.environ["VLLM_NO_USAGE_STATS"] = "1"  # Avoid internal HTTP requests
+    # Ensure our publisher makes it to the new process
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+    engine_args = AsyncEngineArgs(**arg_map)
+    model_config = engine_args.create_model_config()
+    # Load default sampling params from `generation_config.json`
+    default_sampling_params = model_config.get_diff_sampling_param()
+
+    # Taken from build_async_engine_client_from_engine_args()
+    usage_context = UsageContext.OPENAI_API_SERVER
+    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
+
+    # Explicitly pass our custom stat logger for metrics
+    engine_client = AsyncLLM.from_vllm_config(
+        vllm_config=vllm_config,
+        usage_context=usage_context,
+        stat_loggers=[StatLoggerFactory(component)],
+        disable_log_requests=engine_args.disable_log_requests,
+        disable_log_stats=engine_args.disable_log_stats,
+    )
+
+    logger.info("VllmWorker has been initialized")
+
+    # Use the block size resolved by the engine config. `engine_args.block_size`
+    # is only set when it was passed explicitly, so it can be None (for example
+    # with `--kv-block-size 0`), which the publisher config cannot accept.
+    zmq_config = KvEventPublisherFromZmqConfig(
+        worker_id=endpoint.lease_id(),
+        kv_block_size=vllm_config.cache_config.block_size,
+    )
+
+    # Bound to a local name so the publisher object stays referenced while
+    # the endpoint is being served.
+    _ = KvEventPublisherFromZmq(component=component, config=zmq_config)
+
+    handler = RequestHandler(component, engine_client, default_sampling_params)
+
+    # The server shuts down gracefully (i.e., lets open TCP streams finish)
+    # after the lease is revoked.
+    await endpoint.serve_endpoint(handler.generate)
+
+
+def cmd_line_args():
+    """Parse the process command line into a populated `Config`.
+
+    Logs an error and exits with status 1 when --endpoint does not split into
+    exactly three dot-separated parts.
+    """
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_specs = [
+        (
+            "--endpoint",
+            dict(
+                type=str,
+                default=DEFAULT_ENDPOINT,
+                help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
+            ),
+        ),
+        (
+            "--model-path",
+            dict(
+                type=str,
+                default=DEFAULT_MODEL,
+                help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
+            ),
+        ),
+        (
+            "--model-name",
+            dict(
+                type=str,
+                default="",
+                help="Name to serve the model under. Defaults to deriving it from model path.",
+            ),
+        ),
+        (
+            "--tensor-parallel-size",
+            dict(type=int, default=1, help="Number of GPUs to use."),
+        ),
+        (
+            "--kv-block-size",
+            dict(type=int, default=16, help="Size of a KV cache block."),
+        ),
+        (
+            "--context-length",
+            dict(
+                type=int,
+                default=None,
+                help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
+            ),
+        ),
+        (
+            "--extra-engine-args",
+            dict(
+                type=str,
+                default="",
+                help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
+            ),
+        ),
+    ]
+
+    parser = argparse.ArgumentParser(
+        description="vLLM server integrated with Dynamo LLM."
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    args = parser.parse_args()
+
+    # Drop the first "dyn://" occurrence, then expect namespace.component.endpoint.
+    endpoint_parts = args.endpoint.replace("dyn://", "", 1).split(".")
+    if len(endpoint_parts) != 3:
+        logging.error(
+            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
+        )
+        sys.exit(1)
+
+    config = Config()
+    config.namespace, config.component, config.endpoint = endpoint_parts
+    config.model_path = args.model_path
+    # An empty name becomes None, which becomes an `Option` on the Rust side.
+    config.model_name = args.model_name or None
+    config.tensor_parallel_size = args.tensor_parallel_size
+    config.kv_block_size = args.kv_block_size
+    config.context_length = args.context_length
+    config.extra_engine_args = args.extra_engine_args
+
+    return config
+
+
+if __name__ == "__main__":
+    # Switch asyncio to uvloop before the event loop is created.
+    uvloop.install()
+    # `worker` is called with no arguments; presumably the @dynamo_worker
+    # decorator supplies the `runtime` parameter - confirm against dynamo.runtime.
+    asyncio.run(worker())
--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -250,6 +250,19 @@ dependencies = [
 "zmq",
 ]

+[[package]]
+name = "asynchronous-codec"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a860072022177f903e59730004fb5dc13db9275b79bb2aef7ba8ce831956c233"
+dependencies = [
+ "bytes",
+ "futures-sink",
+ "futures-util",
+ "memchr",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "atomic"
 version = "0.6.0"
@@ -887,6 +900,19 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "dashmap"
+version = "5.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
+dependencies = [
+ "cfg-if 1.0.0",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
 [[package]]
 name = "data-encoding"
 version = "2.9.0"
@@ -1106,6 +1132,7 @@ dependencies = [
 "rand 0.9.1",
 "rayon",
 "regex",
+ "rmp-serde",
 "serde",
 "serde_json",
 "strum",
@@ -1123,6 +1150,7 @@ dependencies = [
 "uuid",
 "validator",
 "xxhash-rust",
+ "zeromq",
 ]

 [[package]]
@@ -1909,6 +1937,12 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"

+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
 [[package]]
 name = "hashbrown"
 version = "0.15.2"
@@ -3818,6 +3852,28 @@ dependencies = [
 "windows-sys 0.52.0",
 ]

+[[package]]
+name = "rmp"
+version = "0.8.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4"
+dependencies = [
+ "byteorder",
+ "num-traits",
+ "paste",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db"
+dependencies = [
+ "byteorder",
+ "rmp",
+ "serde",
+]
+
 [[package]]
 name = "rustc-demangle"
 version = "0.1.24"
@@ -4601,6 +4657,7 @@ checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df"
 dependencies = [
 "bytes",
 "futures-core",
+ "futures-io",
 "futures-sink",
 "pin-project-lite",
 "tokio",
@@ -5692,6 +5749,33 @@ version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"

+[[package]]
+name = "zeromq"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a4528179201f6eecf211961a7d3276faa61554c82651ecc66387f68fc3004bd"
+dependencies = [
+ "async-trait",
+ "asynchronous-codec",
+ "bytes",
+ "crossbeam-queue",
+ "dashmap",
+ "futures-channel",
+ "futures-io",
+ "futures-task",
+ "futures-util",
+ "log",
+ "num-traits",
+ "once_cell",
+ "parking_lot",
+ "rand 0.8.5",
+ "regex",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-util",
+ "uuid",
+]
+
 [[package]]
 name = "zeromq-src"
 version = "0.2.6+4.3.4"

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -61,6 +61,8 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<llm::kv::AggregatedMetrics>()?;
    m.add_class::<llm::kv::KvMetricsAggregator>()?;
    m.add_class::<llm::kv::KvEventPublisher>()?;
+    m.add_class::<llm::kv::KvEventPublisherFromZmq>()?;
+    m.add_class::<llm::kv::KvEventPublisherFromZmqConfig>()?;
    m.add_class::<llm::kv::KvRecorder>()?;
    m.add_class::<llm::nats::NatsQueue>()?;
    m.add_class::<http::HttpService>()?;

--- a/lib/bindings/python/rust/llm/kv.rs
+++ b/lib/bindings/python/rust/llm/kv.rs
@@ -14,13 +14,15 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::sync::atomic::AtomicU32;

 use super::*;
 use llm_rs::kv_router::indexer::KvIndexerInterface;
 use rs::traits::events::EventSubscriber;
 use tracing;

-use llm_rs::kv_router::{indexer::compute_block_hash_for_seq, protocols::*};
+use llm_rs::kv_router::protocols::*;
+use llm_rs::kv_router::publisher::create_stored_blocks;

 #[pyclass]
 pub(crate) struct KvRouter {
@@ -93,6 +95,7 @@ impl KvMetricsPublisher {
    }

    #[allow(clippy::too_many_arguments)]
+    #[pyo3(signature = (request_active_slots, request_total_slots, kv_active_blocks, kv_total_blocks, num_requests_waiting, gpu_cache_usage_perc, gpu_prefix_cache_hit_rate, data_parallel_rank = 0))]
    fn publish(
        &self,
        _py: Python,
@@ -103,10 +106,12 @@ impl KvMetricsPublisher {
        num_requests_waiting: u64,
        gpu_cache_usage_perc: f32,
        gpu_prefix_cache_hit_rate: f32,
+        data_parallel_rank: u32,
    ) -> PyResult<()> {
        self.inner
            .publish(
                llm_rs::kv_router::protocols::ForwardPassMetrics {
+                    data_parallel_rank: Some(data_parallel_rank),
                    request_active_slots,
                    request_total_slots,
                    kv_active_blocks,
@@ -121,10 +126,73 @@ impl KvMetricsPublisher {
    }
 }

+/// Configuration passed to `KvEventPublisherFromZmq::new`.
+///
+/// All fields are readable and writable from Python.
+#[pyclass]
+#[derive(Clone)]
+pub struct KvEventPublisherFromZmqConfig {
+    /// Worker identifier forwarded to the publisher's background task.
+    #[pyo3(get, set)]
+    pub worker_id: i64,
+    /// KV block size used to construct the underlying publisher.
+    #[pyo3(get, set)]
+    pub kv_block_size: usize,
+    /// ZMQ endpoint forwarded to the background task.
+    #[pyo3(get, set)]
+    pub zmq_endpoint: String,
+    /// ZMQ topic forwarded to the background task.
+    #[pyo3(get, set)]
+    pub zmq_topic: String,
+}
+
+#[pymethods]
+impl KvEventPublisherFromZmqConfig {
+    /// Builds a config. From Python, `zmq_endpoint` defaults to
+    /// `"tcp://127.0.0.1:5557"` and `zmq_topic` defaults to the empty string.
+    #[new]
+    #[pyo3(signature = (
+        worker_id,
+        kv_block_size,
+        zmq_endpoint = "tcp://127.0.0.1:5557".to_string(),
+        zmq_topic = "".to_string()
+    ))]
+    pub fn new(
+        worker_id: i64,
+        kv_block_size: usize,
+        zmq_endpoint: String,
+        zmq_topic: String,
+    ) -> Self {
+        Self {
+            worker_id,
+            kv_block_size,
+            zmq_endpoint,
+            zmq_topic,
+        }
+    }
+}
+
+/// Python-facing wrapper around the ZMQ-backed KV event publisher.
+#[pyclass]
+pub(crate) struct KvEventPublisherFromZmq {
+    inner: llm_rs::kv_router::publisher::KvEventPublisherFromZmq,
+}
+
+#[pymethods]
+impl KvEventPublisherFromZmq {
+    /// Creates the publisher and immediately starts its background task
+    /// using the values carried by `config`.
+    #[new]
+    fn new(component: Component, config: KvEventPublisherFromZmqConfig) -> PyResult<Self> {
+        let KvEventPublisherFromZmqConfig {
+            worker_id,
+            kv_block_size,
+            zmq_endpoint,
+            zmq_topic,
+        } = config;
+
+        let mut publisher =
+            llm_rs::kv_router::publisher::KvEventPublisherFromZmq::new(kv_block_size);
+        publisher.start_background_task(component.inner, worker_id, zmq_endpoint, zmq_topic);
+
+        Ok(Self { inner: publisher })
+    }
+
+    /// Delegates to the inner publisher's `shutdown`.
+    fn shutdown(&mut self) {
+        self.inner.shutdown()
+    }
+}
+
 #[pyclass]
 pub(crate) struct KvEventPublisher {
    inner: Arc<llm_rs::kv_router::publisher::KvEventPublisher>,
-    warning_count: u32,
+    kv_block_size: usize,
+    warning_count: Arc<AtomicU32>,
 }

 #[pymethods]
@@ -132,14 +200,15 @@ impl KvEventPublisher {
    #[new]
    fn new(component: Component, worker_id: i64, kv_block_size: usize) -> PyResult<Self> {
        let inner = llm_rs::kv_router::publisher::KvEventPublisher::new(
-            component.inner.clone(),
+            component.inner,
            worker_id,
            kv_block_size,
        )
        .map_err(to_pyerr)?;
        Ok(Self {
            inner: inner.into(),
-            warning_count: 0,
+            kv_block_size,
+            warning_count: Arc::new(AtomicU32::new(0)),
        })
    }

@@ -151,19 +220,21 @@ impl KvEventPublisher {
        event_id: u64,
        token_ids: Vec<u32>,
        num_block_tokens: Vec<u64>,
-        block_hashes: Vec<u64>,
+        block_hashes: Vec<i64>,
        lora_id: u64,
-        parent_hash: Option<u64>,
+        parent_hash: Option<i64>,
    ) -> PyResult<()> {
        let event = KvCacheEvent {
            event_id,
            data: KvCacheEventData::Stored(KvCacheStoreData {
-                parent_hash: parent_hash.map(ExternalSequenceBlockHash),
-                blocks: self.create_stored_blocks(
+                parent_hash: parent_hash.map(ExternalSequenceBlockHash::from),
+                blocks: create_stored_blocks(
+                    self.kv_block_size,
                    &token_ids,
                    &num_block_tokens,
                    &block_hashes,
                    lora_id,
+                    &self.warning_count,
                ),
            }),
        };
@@ -171,10 +242,10 @@ impl KvEventPublisher {
        self.inner.publish(event).map_err(to_pyerr)
    }

-    fn publish_removed(&self, _py: Python, event_id: u64, block_hashes: Vec<u64>) -> PyResult<()> {
+    fn publish_removed(&self, _py: Python, event_id: u64, block_hashes: Vec<i64>) -> PyResult<()> {
        let block_hashes: Vec<ExternalSequenceBlockHash> = block_hashes
            .iter()
-            .map(|&v| ExternalSequenceBlockHash(v))
+            .map(|&h| ExternalSequenceBlockHash::from(h))
            .collect();
        let event = KvCacheEvent {
            event_id,
@@ -185,50 +256,6 @@ impl KvEventPublisher {
    }
 }

-impl KvEventPublisher {
-    fn create_stored_block_from_parts(
-        &self,
-        block_hash: u64,
-        token_ids: &[u32],
-        _lora_id: u64,
-    ) -> KvCacheStoredBlockData {
-        let tokens_hash = compute_block_hash_for_seq(token_ids, self.inner.kv_block_size())[0];
-        KvCacheStoredBlockData {
-            block_hash: ExternalSequenceBlockHash(block_hash),
-            tokens_hash,
-        }
-    }
-
-    fn create_stored_blocks(
-        &mut self,
-        token_ids: &[u32],
-        num_block_tokens: &[u64],
-        block_hashes: &[u64],
-        lora_id: u64,
-    ) -> Vec<KvCacheStoredBlockData> {
-        let mut blocks: Vec<KvCacheStoredBlockData> = Vec::new();
-
-        let mut token_offset: usize = 0;
-        for (num_tokens_it, block_hash_it) in num_block_tokens.iter().zip(block_hashes.iter()) {
-            if (self.warning_count < 3) && (*num_tokens_it != self.inner.kv_block_size() as u64) {
-                tracing::warn!(
-                    "Block not published. Block size must be {} tokens to be published. Block size is: {}",
-                    self.inner.kv_block_size(),
-                    *num_tokens_it
-                );
-                self.warning_count += 1;
-                break;
-            }
-
-            let tokens = &token_ids[token_offset..(token_offset + *num_tokens_it as usize)];
-            blocks.push(self.create_stored_block_from_parts(*block_hash_it, tokens, lora_id));
-            token_offset += *num_tokens_it as usize;
-        }
-
-        blocks
-    }
-}
-
 #[pyclass]
 #[derive(Clone)]
 pub(crate) struct OverlapScores {

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -368,6 +368,10 @@ class KvMetricsPublisher:
        request_total_slots: int,
        kv_active_blocks: int,
        kv_total_blocks: int,
+        num_requests_waiting: int,
+        gpu_cache_usage_perc: float,
+        gpu_prefix_cache_hit_rate: float,
+        data_parallel_rank: int = 0,
    ) -> None:
        """
        Update the KV metrics being reported.
@@ -575,6 +579,40 @@ class KvEventPublisher:
        """
        ...

+class KvEventPublisherFromZmqConfig:
+    def __init__(
+        self,
+        worker_id: int,
+        kv_block_size: int,
+        zmq_endpoint: str = "tcp://127.0.0.1:5557",
+        zmq_topic: str = ""
+    ) -> None:
+        """
+        Configuration for the KvEventPublisherFromZmq.
+
+        :param worker_id: The worker ID the published events are attributed to.
+        :param kv_block_size: The size of a KV cache block, in tokens.
+        :param zmq_endpoint: The ZeroMQ endpoint. Defaults to "tcp://127.0.0.1:5557".
+        :param zmq_topic: The ZeroMQ topic to subscribe to. Defaults to an empty string.
+        """
+        ...
+
+class KvEventPublisherFromZmq:
+    def __init__(self, component: Component, config: KvEventPublisherFromZmqConfig) -> None:
+        """
+        Initializes a new KvEventPublisherFromZmq instance and starts its
+        background task right away.
+
+        :param component: The component to be used.
+        :param config: Configuration for the event publisher (worker ID,
+            KV block size, ZeroMQ endpoint and topic).
+        """
+        ...
+
+    def shutdown(self) -> None:
+        """
+        Shuts down the event publisher, stopping any background tasks.
+        """
+        ...
+
 class HttpService:
    """
    A HTTP service for dynamo applications.

--- a/lib/bindings/python/src/dynamo/llm/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/__init__.py
@@ -25,6 +25,8 @@ from dynamo._core import HttpAsyncEngine as HttpAsyncEngine
 from dynamo._core import HttpError as HttpError
 from dynamo._core import HttpService as HttpService
 from dynamo._core import KvEventPublisher as KvEventPublisher
+from dynamo._core import KvEventPublisherFromZmq as KvEventPublisherFromZmq
+from dynamo._core import KvEventPublisherFromZmqConfig as KvEventPublisherFromZmqConfig
 from dynamo._core import KvIndexer as KvIndexer
 from dynamo._core import KvMetricsAggregator as KvMetricsAggregator
 from dynamo._core import KvMetricsPublisher as KvMetricsPublisher

--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -116,6 +116,10 @@ minijinja-contrib = { version = "2.10.2", features = ["pycompat"] }
 ggus = "0.4.0"
 memmap2 = "0.9.5"

+# Publishers
+zeromq = "0.4.1"
+rmp-serde = "1.3"
+
 [dev-dependencies]
 assert_matches = "1.5"
 hf-hub = { workspace = true }

--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -73,7 +73,7 @@ impl KvRouter {
            .primary_lease()
            .expect("Cannot KV route static workers")
            .primary_token();
-
+        tracing::info!("KV Routing initialized");
        let metrics_aggregator =
            KvMetricsAggregator::new(component.clone(), cancellation_token.clone()).await;
        let indexer = KvIndexer::new(cancellation_token.clone(), block_size);

--- a/lib/llm/src/kv_router/protocols.rs
+++ b/lib/llm/src/kv_router/protocols.rs
@@ -41,6 +41,7 @@ pub struct WorkerSelectionResult {

 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct ForwardPassMetrics {
+    pub data_parallel_rank: Option<u32>, // backwards compatible
    pub request_active_slots: u64,
    pub request_total_slots: u64,
    pub kv_active_blocks: u64,
@@ -65,6 +66,21 @@ pub struct LocalBlockHash(pub u64);
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)]
 pub struct ExternalSequenceBlockHash(pub u64);

+// Conversions from raw integer hashes into `ExternalSequenceBlockHash`.
+impl From<u64> for ExternalSequenceBlockHash {
+    /// Wraps the value unchanged.
+    fn from(value: u64) -> Self {
+        ExternalSequenceBlockHash(value)
+    }
+}
+
+impl From<i64> for ExternalSequenceBlockHash {
+    /// Bit-for-bit reinterpretation of a signed hash as unsigned.
+    /// No information is lost; negative inputs simply show up as large `u64` values.
+    fn from(value: i64) -> Self {
+        let bits = value as u64;
+        ExternalSequenceBlockHash(bits)
+    }
+}
+
 /// Represents a collection of cache events and a shutdown flag.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct KvCacheEvents {

--- a/lib/llm/src/kv_router/publisher.rs
+++ b/lib/llm/src/kv_router/publisher.rs
@@ -13,9 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use crate::kv_router::{indexer::RouterEvent, protocols::*, KV_EVENT_SUBJECT, KV_METRICS_ENDPOINT};
+use crate::kv_router::{
+    indexer::{compute_block_hash_for_seq, RouterEvent},
+    protocols::*,
+    KV_EVENT_SUBJECT, KV_METRICS_ENDPOINT,
+};
 use async_trait::async_trait;
-use dynamo_runtime::traits::{events::EventPublisher, DistributedRuntimeProvider};
+use dynamo_runtime::traits::{events::EventPublisher, DistributedRuntimeProvider, RuntimeProvider};
 use dynamo_runtime::{
    component::Component,
    pipeline::{
@@ -29,9 +33,20 @@ use futures::stream;
 use std::sync::Arc;
 use tokio::sync::mpsc;

+use rmp_serde as rmps;
+use serde::Deserialize;
+use serde::Serialize;
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::time::Duration;
+use zeromq::{Socket, SocketRecv, SubSocket};
+
+// -------------------------------------------------------------------------
+// KV Event Publishers -----------------------------------------------------
+// -------------------------------------------------------------------------
+
 pub struct KvEventPublisher {
-    tx: mpsc::UnboundedSender<KvCacheEvent>,
    kv_block_size: usize,
+    tx: mpsc::UnboundedSender<KvCacheEvent>,
 }

 impl KvEventPublisher {
@@ -44,7 +59,7 @@ impl KvEventPublisher {
    }

    pub fn publish(&self, event: KvCacheEvent) -> Result<(), mpsc::error::SendError<KvCacheEvent>> {
-        tracing::debug!("Publish event: {:?}", event);
+        tracing::trace!("Publish event: {:?}", event);
        self.tx.send(event)
    }

@@ -72,6 +87,373 @@ fn start_publish_task(
    });
 }

+/// Bridges KV cache events from an engine's ZMQ socket to the router.
+///
+/// vLLM and SGLang use multi-processing to launch engine-core processes, and
+/// those processes publish their events to a ZMQ socket
+/// (for more info on zmq: https://zeromq.org/).
+/// This publisher reads those events and publishes them to NATS.
+/// The indexer will get the events from NATS and put them in the global prefix tree.
+pub struct KvEventPublisherFromZmq {
+    /// Number of tokens a block must contain to be published.
+    kv_block_size: usize,
+    /// Task that decodes raw payloads and publishes router events; `None` until started.
+    processor_handle: Option<tokio::task::JoinHandle<()>>,
+    /// Task that reads multipart messages from the ZMQ SUB socket; `None` until started.
+    zmq_handle: Option<tokio::task::JoinHandle<()>>,
+    /// Cancellation token handed to both background tasks; `None` until started.
+    zmq_token: Option<dynamo_runtime::CancellationToken>,
+    /// Counts "block not published" warnings so that only the first few are logged.
+    warning_count: Arc<AtomicU32>,
+}
+
+impl KvEventPublisherFromZmq {
+    /// Creates an idle publisher for blocks of `kv_block_size` tokens.
+    ///
+    /// No background work runs until
+    /// [`start_background_task`](Self::start_background_task) is called.
+    pub fn new(kv_block_size: usize) -> Self {
+        Self {
+            kv_block_size,
+            processor_handle: None,
+            zmq_handle: None,
+            zmq_token: None,
+            warning_count: Arc::new(AtomicU32::new(0)),
+        }
+    }
+
+    /// Spawns the two background tasks that move events from ZMQ to the router.
+    ///
+    /// * A listener task subscribes to `zmq_topic` on `zmq_endpoint` and forwards
+    ///   `(sequence number, payload)` pairs over an unbounded channel.
+    /// * A processor task decodes each payload and publishes the resulting
+    ///   events through `component`, tagged with `worker_id`.
+    ///
+    /// Both tasks are spawned on `component.drt().runtime().secondary()` and share
+    /// a child token of `component.rt()`. Calling this again first stops the tasks
+    /// started by the previous call.
+    pub fn start_background_task(
+        &mut self,
+        component: Component,
+        worker_id: i64,
+        zmq_endpoint: String,
+        zmq_topic: String,
+    ) {
+        // Overwriting the stored handles would only detach the old tasks: they
+        // would keep running and publishing with no way to stop them from this
+        // object. Stop them explicitly before starting a fresh pair.
+        self.shutdown();
+
+        let kv_block_size = self.kv_block_size;
+        let warning_count = self.warning_count.clone();
+        let (raw_tx, raw_rx) = mpsc::unbounded_channel::<(u64, Vec<u8>)>();
+
+        let zmq_token = component.rt().child_token();
+        self.zmq_token = Some(zmq_token.clone());
+
+        // Spawn async ZMQ listener
+        self.zmq_handle = Some(
+            component
+                .drt()
+                .runtime()
+                .secondary()
+                .spawn(start_zmq_listener(
+                    zmq_endpoint,
+                    zmq_topic,
+                    raw_tx,
+                    zmq_token.clone(),
+                )),
+        );
+
+        // Spawn the processor that drains the channel fed by the listener
+        self.processor_handle = Some(component.drt().runtime().secondary().spawn(
+            start_event_processor(
+                raw_rx,
+                component,
+                worker_id,
+                kv_block_size,
+                warning_count,
+                zmq_token,
+            ),
+        ));
+    }
+
+    /// Cancels the shared token and aborts both background tasks.
+    ///
+    /// Every field is `take`n, so calling this when nothing is running (or
+    /// calling it twice) is a no-op.
+    pub fn shutdown(&mut self) {
+        if let Some(token) = self.zmq_token.take() {
+            token.cancel();
+        }
+        if let Some(handle) = self.zmq_handle.take() {
+            handle.abort();
+        }
+        if let Some(handle) = self.processor_handle.take() {
+            handle.abort();
+        }
+    }
+}
+
+/// Drains `raw_rx`, decodes each payload as a msgpack [`KvEventBatch`] and
+/// publishes every convertible event on [`KV_EVENT_SUBJECT`] as a
+/// [`RouterEvent`] tagged with `worker_id`.
+///
+/// The loop ends when `cancellation_token` is cancelled or the channel closes.
+/// Payloads that fail to decode and events that fail to publish are logged and
+/// skipped; neither stops the loop.
+async fn start_event_processor<P: EventPublisher>(
+    mut raw_rx: mpsc::UnboundedReceiver<(u64, Vec<u8>)>,
+    component: P,
+    worker_id: i64,
+    kv_block_size: usize,
+    warning_count: Arc<AtomicU32>,
+    cancellation_token: dynamo_runtime::CancellationToken,
+) {
+    loop {
+        let received = tokio::select! {
+            // Poll branches in order so cancellation wins over a pending message
+            biased;
+
+            _ = cancellation_token.cancelled() => {
+                tracing::debug!("Event processor received cancellation signal");
+                break;
+            }
+
+            msg = raw_rx.recv() => msg,
+        };
+
+        // `None` means every sender is gone
+        let Some((seq, payload)) = received else {
+            tracing::debug!("Event processor channel closed");
+            break;
+        };
+
+        let batch = match rmps::from_slice::<KvEventBatch>(&payload) {
+            Ok(batch) => batch,
+            Err(e) => {
+                tracing::warn!(error=%e, "Failed to decode KVEventBatch msgpack");
+                continue;
+            }
+        };
+
+        // Events that convert_event maps to `None` are dropped here
+        let events = batch
+            .events
+            .into_iter()
+            .filter_map(|raw_evt| convert_event(raw_evt, seq, kv_block_size, &warning_count));
+
+        for event in events {
+            let router_event = RouterEvent::new(worker_id, event);
+            if let Err(e) = component.publish(KV_EVENT_SUBJECT, &router_event).await {
+                tracing::warn!(error=%e, "Failed to publish router event.");
+            }
+        }
+    }
+    tracing::debug!("Event processor exiting");
+}
+
+// Error handling configuration for ZMQ operations
+/// Delay multiplied by the power-of-two factor in `calculate_backoff_ms`.
+const INITIAL_BACKOFF_MS: u64 = 10;
+/// Upper bound on any single backoff delay.
+const MAX_BACKOFF_MS: u64 = 5000;
+/// Number of back-to-back receive errors after which the listener gives up.
+const MAX_CONSECUTIVE_ERRORS: u32 = 10;
+/// Cap at 2^8 = 256x multiplier to prevent overflow.
+const MAX_BACKOFF_EXPONENT: u32 = 8;
+
+/// Returns the exponential backoff delay, in milliseconds, for the given
+/// number of consecutive errors: `INITIAL_BACKOFF_MS * 2^n`, where `n` is
+/// clamped to `MAX_BACKOFF_EXPONENT` and the result to `MAX_BACKOFF_MS`.
+fn calculate_backoff_ms(consecutive_errors: u32) -> u64 {
+    let exponent = consecutive_errors.min(MAX_BACKOFF_EXPONENT);
+    let uncapped_ms = INITIAL_BACKOFF_MS * 2_u64.pow(exponent);
+    uncapped_ms.min(MAX_BACKOFF_MS)
+}
+
+/// Subscribes to `zmq_topic` on `zmq_endpoint` and forwards every well-formed
+/// message to `raw_tx` as `(sequence number, payload bytes)`.
+///
+/// A message must consist of exactly three frames, `[topic, seq, payload]`,
+/// where `seq` is an 8-byte big-endian integer; anything else is logged and
+/// skipped. Receive errors trigger an exponential backoff, and
+/// `MAX_CONSECUTIVE_ERRORS` of them in a row terminate the listener.
+///
+/// The function returns when `zmq_token` is cancelled (also while backing
+/// off), when the receiving side of `raw_tx` is dropped, or when the initial
+/// subscribe/connect fails.
+async fn start_zmq_listener(
+    zmq_endpoint: String,
+    zmq_topic: String,
+    raw_tx: mpsc::UnboundedSender<(u64, Vec<u8>)>,
+    zmq_token: dynamo_runtime::CancellationToken,
+) {
+    tracing::debug!(
+        "KVEventPublisher connecting to ZMQ endpoint {} (topic '{}')",
+        zmq_endpoint,
+        zmq_topic
+    );
+
+    let mut socket = SubSocket::new();
+
+    // Subscribe to the requested topic (empty string == all topics)
+    if let Err(e) = socket.subscribe(&zmq_topic).await {
+        tracing::error!("Failed to subscribe on ZMQ socket: {}", e);
+        return;
+    }
+
+    if let Err(e) = socket.connect(&zmq_endpoint).await {
+        tracing::error!("Failed to connect ZMQ SUB socket: {}", e);
+        return;
+    }
+
+    let mut consecutive_errors = 0u32;
+
+    loop {
+        tokio::select! {
+            biased;
+
+            // Check for cancellation
+            _ = zmq_token.cancelled() => {
+                tracing::info!("ZMQ listener received cancellation signal");
+                break;
+            }
+
+            // Receive message
+            msg_result = socket.recv() => {
+                let msg = match msg_result {
+                    Ok(msg) => msg,
+                    Err(e) => {
+                        consecutive_errors += 1;
+
+                        if consecutive_errors >= MAX_CONSECUTIVE_ERRORS {
+                            tracing::error!(
+                                error=%e,
+                                consecutive_errors=%consecutive_errors,
+                                "Too many consecutive ZMQ errors, terminating listener"
+                            );
+                            break;
+                        }
+
+                        // Simple exponential backoff with max exponent to prevent overflow
+                        let backoff_ms = calculate_backoff_ms(consecutive_errors);
+
+                        tracing::warn!(
+                            error=%e,
+                            consecutive_errors=%consecutive_errors,
+                            backoff_ms=%backoff_ms,
+                            "Error reading from ZMQ socket, applying exponential backoff"
+                        );
+
+                        // Race the delay against cancellation so a shutdown request
+                        // is not held up for the length of the backoff.
+                        tokio::select! {
+                            biased;
+
+                            _ = zmq_token.cancelled() => {
+                                tracing::info!("ZMQ listener received cancellation signal");
+                                break;
+                            }
+
+                            _ = tokio::time::sleep(Duration::from_millis(backoff_ms)) => {}
+                        }
+                        continue;
+                    }
+                };
+
+                // Reset error count on successful message
+                consecutive_errors = 0;
+
+                // We expect multipart frames: [topic, seq, payload]
+                let frames = msg.into_vec();
+                let [_topic, seq_frame, payload_frame] = frames.as_slice() else {
+                    tracing::warn!(expected=3, actual=%frames.len(), "Received unexpected ZMQ frame count");
+                    continue;
+                };
+
+                // The sequence number must be exactly 8 bytes (big-endian u64)
+                let Ok(seq_bytes) = <[u8; 8]>::try_from(&seq_frame[..]) else {
+                    tracing::warn!(expected=8, actual=%seq_frame.len(), "Invalid sequence number byte length");
+                    continue;
+                };
+
+                let seq = u64::from_be_bytes(seq_bytes);
+                // Only the payload frame is copied out of the message
+                if raw_tx.send((seq, payload_frame.to_vec())).is_err() {
+                    tracing::warn!("Failed to send message to channel - receiver dropped");
+                    break;
+                }
+            }
+        }
+    }
+    tracing::debug!("ZMQ listener exiting");
+}
+
+/// Translates a raw ZMQ event into the router's [`KvCacheEvent`], using
+/// `event_id` as the event's id.
+///
+/// * `BlockStored` becomes `KvCacheEventData::Stored`; its blocks are built by
+///   [`create_stored_blocks`], with every block assumed to hold `block_size`
+///   tokens and a missing `lora_id` treated as `0`.
+/// * `BlockRemoved` becomes `KvCacheEventData::Removed`.
+/// * `AllBlocksCleared` yields `None`: it is ignored until a concrete format
+///   is defined.
+fn convert_event(
+    raw: RawKvEvent,
+    event_id: u64,
+    kv_block_size: usize,
+    warning_count: &Arc<AtomicU32>,
+) -> Option<KvCacheEvent> {
+    let data = match raw {
+        RawKvEvent::BlockStored {
+            block_hashes,
+            parent_block_hash,
+            token_ids,
+            block_size,
+            lora_id,
+        } => {
+            // The event carries a single size that applies to all of its blocks
+            let num_block_tokens = vec![block_size as u64; block_hashes.len()];
+            let blocks = create_stored_blocks(
+                kv_block_size,
+                &token_ids,
+                &num_block_tokens,
+                &block_hashes,
+                lora_id.unwrap_or(0),
+                warning_count,
+            );
+            KvCacheEventData::Stored(KvCacheStoreData {
+                parent_hash: parent_block_hash.map(ExternalSequenceBlockHash::from),
+                blocks,
+            })
+        }
+        RawKvEvent::BlockRemoved { block_hashes } => {
+            KvCacheEventData::Removed(KvCacheRemoveData {
+                block_hashes: block_hashes
+                    .into_iter()
+                    .map(ExternalSequenceBlockHash::from)
+                    .collect(),
+            })
+        }
+        RawKvEvent::AllBlocksCleared => {
+            tracing::debug!("Received AllBlocksCleared event – currently ignored");
+            return None;
+        }
+    };
+
+    Some(KvCacheEvent { event_id, data })
+}
+
+/// Builds the stored-block record for a single block.
+///
+/// `block_hash` is converted to an [`ExternalSequenceBlockHash`], and the
+/// tokens hash is the first hash `compute_block_hash_for_seq` returns for
+/// `token_ids` at `kv_block_size`. `_lora_id` is accepted but not used.
+///
+/// NOTE(review): the first hash is taken by index, so this presumably panics
+/// when `token_ids` does not cover one full block — confirm against
+/// `compute_block_hash_for_seq`.
+pub fn create_stored_block_from_parts(
+    kv_block_size: usize,
+    block_hash: i64,
+    token_ids: &[u32],
+    _lora_id: u64,
+) -> KvCacheStoredBlockData {
+    let block_hash = ExternalSequenceBlockHash::from(block_hash);
+    let local_hashes = compute_block_hash_for_seq(token_ids, kv_block_size);
+    KvCacheStoredBlockData {
+        block_hash,
+        tokens_hash: local_hashes[0],
+    }
+}
+
+/// Builds one [`KvCacheStoredBlockData`] per leading full-size block.
+///
+/// `num_block_tokens` and `block_hashes` are walked in lockstep (the shorter
+/// one bounds the iteration) and `token_ids` is consumed in consecutive
+/// windows, one per block. Processing stops at the first block that
+///
+/// * does not contain exactly `kv_block_size` tokens, or
+/// * is not fully covered by `token_ids`,
+///
+/// so the result holds only the blocks before it. Both cases bump
+/// `warning_count`, and only the first three warnings overall are logged.
+pub fn create_stored_blocks(
+    kv_block_size: usize,
+    token_ids: &[u32],
+    num_block_tokens: &[u64],
+    block_hashes: &[i64],
+    lora_id: u64,
+    warning_count: &Arc<AtomicU32>,
+) -> Vec<KvCacheStoredBlockData> {
+    let mut blocks: Vec<KvCacheStoredBlockData> =
+        Vec::with_capacity(num_block_tokens.len().min(block_hashes.len()));
+
+    let mut token_offset: usize = 0;
+    for (&num_tokens, &block_hash) in num_block_tokens.iter().zip(block_hashes) {
+        if num_tokens != kv_block_size as u64 {
+            if warning_count.fetch_add(1, Ordering::Relaxed) < 3 {
+                tracing::warn!(
+                    "Block not published. Block size must be {} tokens to be published. Block size is: {}",
+                    kv_block_size,
+                    num_tokens
+                );
+            }
+            break;
+        }
+
+        // The check above guarantees this block spans exactly `kv_block_size`
+        // tokens. The token ids come from another process, so the window is
+        // bounds-checked instead of indexed: a short `token_ids` must not
+        // panic the publisher.
+        let block_end = token_offset.saturating_add(kv_block_size);
+        let Some(tokens) = token_ids.get(token_offset..block_end) else {
+            if warning_count.fetch_add(1, Ordering::Relaxed) < 3 {
+                tracing::warn!(
+                    "Block not published. Expected at least {} token ids but received {}",
+                    block_end,
+                    token_ids.len()
+                );
+            }
+            break;
+        };
+
+        blocks.push(create_stored_block_from_parts(
+            kv_block_size,
+            block_hash,
+            tokens,
+            lora_id,
+        ));
+        token_offset = block_end;
+    }
+
+    blocks
+}
+
+// -------------------------------------------------------------------------
+// Types mirroring the Python msgspec-defined structures -------------------
+// -------------------------------------------------------------------------
+
+/// A batch of raw KV events, decoded from the msgpack payload of one ZMQ message.
+#[derive(Debug, Deserialize, Serialize)]
+struct KvEventBatch {
+    /// NOTE(review): presumably the sender's timestamp for the batch; nothing
+    /// in this file reads it — confirm meaning and units against the engine.
+    ts: f64,
+    /// The events of the batch, converted one at a time by `convert_event`.
+    events: Vec<RawKvEvent>,
+}
+
+/// A single KV cache event as received from the engine, before conversion to
+/// the router's `KvCacheEvent`.
+#[derive(Debug, Deserialize, Serialize)]
+#[serde(tag = "type")] // msgspec encodes variant tag as a string when `tag=True`
+enum RawKvEvent {
+    /// One or more blocks were stored.
+    BlockStored {
+        /// One hash per stored block.
+        block_hashes: Vec<i64>,
+        /// Becomes the `parent_hash` of the converted store event.
+        parent_block_hash: Option<i64>,
+        /// Token ids of all stored blocks, consumed in consecutive
+        /// `block_size`-sized windows in `block_hashes` order.
+        token_ids: Vec<u32>,
+        /// Token count applied to every block of this event; blocks are only
+        /// published when it equals the publisher's block size.
+        block_size: usize,
+        /// Treated as `0` when absent.
+        lora_id: Option<u64>,
+    },
+    /// The listed blocks were removed.
+    BlockRemoved {
+        /// Hashes of the removed blocks.
+        block_hashes: Vec<i64>,
+    },
+    /// Currently ignored by `convert_event`.
+    AllBlocksCleared,
+}
+
+// -------------------------------------------------------------------------
+// Metrics Publishers ------------------------------------------------------
+// -------------------------------------------------------------------------
+
 pub struct KvMetricsPublisher {
    tx: tokio::sync::watch::Sender<Arc<ForwardPassMetrics>>,
    rx: tokio::sync::watch::Receiver<Arc<ForwardPassMetrics>>,
@@ -134,3 +516,334 @@ impl AsyncEngine<SingleIn<()>, ManyOut<Annotated<ForwardPassMetrics>>, Error>
        Ok(ResponseStream::new(Box::pin(stream), context))
    }
 }
+
+// -------------------------------------------------------------------------
+// Testing -----------------------------------------------------------------
+// -------------------------------------------------------------------------
+
+/// Unit tests for the pure conversion helpers: they check the hashes, ids and
+/// counts that end up in the converted events, not just the enum variant.
+#[cfg(test)]
+mod test_event_processing {
+    use super::*;
+    use crate::kv_router::indexer::compute_block_hash_for_seq;
+
+    // ---------------------------------------------------------------------
+    // create_stored_block_from_parts --------------------------------------
+    // ---------------------------------------------------------------------
+    #[test]
+    fn test_create_stored_block_from_parts() {
+        let kv_block_size = 4;
+        let token_ids = vec![10, 20, 30, 40];
+        let blk_hash = 0xdead_beef;
+
+        let stored = create_stored_block_from_parts(kv_block_size, blk_hash, &token_ids, 0);
+
+        assert_eq!(stored.block_hash.0, blk_hash as u64);
+        let expected_hash = compute_block_hash_for_seq(&token_ids, 4)[0];
+        assert_eq!(stored.tokens_hash, expected_hash);
+    }
+
+    // ---------------------------------------------------------------------
+    // create_stored_blocks -------------------------------------------------
+    // ---------------------------------------------------------------------
+    #[test]
+    fn test_create_stored_blocks_ok() {
+        let kv_block_size = 4;
+        // two blocks, each of size 4
+        let token_ids = vec![1, 2, 3, 4, 5, 6, 7, 8];
+        let num_block_tokens = vec![4_u64, 4_u64];
+        let block_hashes = vec![111_i64, 222_i64];
+
+        let blocks = create_stored_blocks(
+            kv_block_size,
+            &token_ids,
+            &num_block_tokens,
+            &block_hashes,
+            /*lora_id=*/ 0,
+            &Arc::new(AtomicU32::new(0)),
+        );
+
+        assert_eq!(blocks.len(), 2);
+        assert_eq!(blocks[0].block_hash.0, 111);
+        assert_eq!(blocks[1].block_hash.0, 222);
+
+        // each block must be hashed from its own window of tokens
+        assert_eq!(
+            blocks[0].tokens_hash,
+            compute_block_hash_for_seq(&token_ids[..4], kv_block_size)[0]
+        );
+        assert_eq!(
+            blocks[1].tokens_hash,
+            compute_block_hash_for_seq(&token_ids[4..], kv_block_size)[0]
+        );
+    }
+
+    #[test]
+    fn test_create_stored_blocks_wrong_size_triggers_warning() {
+        let kv_block_size = 4;
+        // second block is the wrong size
+        let token_ids = vec![1, 2, 3, 4, 5, 6, 7];
+        let num_block_tokens = vec![4_u64, 3_u64];
+        let block_hashes = vec![111_i64, 222_i64];
+        let warning_count = Arc::new(AtomicU32::new(0));
+
+        let blocks = create_stored_blocks(
+            kv_block_size,
+            &token_ids,
+            &num_block_tokens,
+            &block_hashes,
+            /*lora_id=*/ 0,
+            &warning_count,
+        );
+
+        // should early-exit as second has mismatch, keeping only the first block
+        assert_eq!(blocks.len(), 1);
+        assert_eq!(blocks[0].block_hash.0, 111);
+        assert_eq!(warning_count.load(Ordering::Relaxed), 1);
+    }
+
+    // ---------------------------------------------------------------------
+    // convert_event --------------------------------------------------------
+    // ---------------------------------------------------------------------
+    #[test]
+    fn test_convert_event_block_stored() {
+        let kv_block_size = 4;
+        let raw_evt = RawKvEvent::BlockStored {
+            block_hashes: vec![10, 11],
+            parent_block_hash: Some(99),
+            token_ids: vec![1, 2, 3, 4, 5, 6, 7, 8],
+            block_size: 4,
+            lora_id: Some(0),
+        };
+
+        let event = convert_event(raw_evt, 42, kv_block_size, &Arc::new(AtomicU32::new(0)))
+            .expect("BlockStored should convert to an event");
+        assert_eq!(event.event_id, 42);
+
+        let KvCacheEventData::Stored(stored) = event.data else {
+            panic!("expected KvCacheEventData::Stored");
+        };
+        assert_eq!(stored.parent_hash, Some(ExternalSequenceBlockHash(99)));
+        assert_eq!(stored.blocks.len(), 2);
+        assert_eq!(stored.blocks[0].block_hash, ExternalSequenceBlockHash(10));
+        assert_eq!(stored.blocks[1].block_hash, ExternalSequenceBlockHash(11));
+    }
+
+    #[test]
+    fn test_convert_event_block_removed() {
+        let kv_block_size = 4;
+        let raw_evt = RawKvEvent::BlockRemoved {
+            block_hashes: vec![123, 456],
+        };
+
+        let event = convert_event(raw_evt, 7, kv_block_size, &Arc::new(AtomicU32::new(0)))
+            .expect("BlockRemoved should convert to an event");
+        assert_eq!(event.event_id, 7);
+
+        let KvCacheEventData::Removed(removed) = event.data else {
+            panic!("expected KvCacheEventData::Removed");
+        };
+        assert_eq!(removed.block_hashes.len(), 2);
+        assert!(removed.block_hashes.contains(&ExternalSequenceBlockHash(123)));
+        assert!(removed.block_hashes.contains(&ExternalSequenceBlockHash(456)));
+    }
+
+    #[test]
+    fn test_convert_event_all_blocks_cleared() {
+        let kv_block_size = 4;
+        let raw_evt = RawKvEvent::AllBlocksCleared;
+        assert!(convert_event(raw_evt, 1, kv_block_size, &Arc::new(AtomicU32::new(0))).is_none());
+    }
+}
+
+/// Tests for the two background tasks, each driven in isolation: the event
+/// processor through a mock publisher, the ZMQ listener through a local PUB
+/// socket.
+#[cfg(test)]
+mod tests_startup_helpers {
+    use super::*;
+    use async_trait;
+    use bytes::Bytes;
+    use std::sync::{Arc, Mutex};
+    use zeromq::{PubSocket, Socket, SocketSend, ZmqMessage};
+
+    // Type alias to resolve clippy::type_complexity warning
+    type PublishedEvents = Arc<Mutex<Vec<(String, Vec<u8>)>>>;
+
+    //--------------------------------------------------------------------
+    // A tiny stand-in for Component that just records every publish call
+    //--------------------------------------------------------------------
+    #[derive(Default)]
+    struct MockComponent {
+        published: PublishedEvents,
+    }
+
+    impl MockComponent {
+        /// Returns the mock together with a shared handle to its recorded calls.
+        fn new() -> (Self, PublishedEvents) {
+            let published = Arc::new(Mutex::new(Vec::new()));
+            (
+                Self {
+                    published: published.clone(),
+                },
+                published,
+            )
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl EventPublisher for MockComponent {
+        // Records `(event name, msgpack-encoded event)` instead of publishing
+        async fn publish(
+            &self,
+            event_name: impl AsRef<str> + Send + Sync,
+            event: &(impl serde::Serialize + Send + Sync),
+        ) -> dynamo_runtime::Result<()> {
+            let bytes = rmp_serde::to_vec(event).unwrap();
+            self.published
+                .lock()
+                .unwrap()
+                .push((event_name.as_ref().to_string(), bytes));
+            Ok(())
+        }
+
+        // Records `(event name, bytes)` as given
+        async fn publish_bytes(
+            &self,
+            event_name: impl AsRef<str> + Send + Sync,
+            bytes: Vec<u8>,
+        ) -> dynamo_runtime::Result<()> {
+            self.published
+                .lock()
+                .unwrap()
+                .push((event_name.as_ref().to_string(), bytes));
+            Ok(())
+        }
+
+        fn subject(&self) -> String {
+            "mock.subject".into()
+        }
+    }
+
+    //--------------------------------------------------------------------
+    // Test start_event_processor in isolation
+    //--------------------------------------------------------------------
+    #[tokio::test]
+    async fn test_start_event_processor_sends_router_event() {
+        let kv_block_size = 4;
+        let worker_id = 99;
+
+        // 1) build a one-item KvEventBatch and msgpack-encode it
+        let batch = KvEventBatch {
+            ts: 0.0,
+            events: vec![RawKvEvent::BlockRemoved {
+                block_hashes: vec![1, 2],
+            }],
+        };
+        let payload = rmps::to_vec(&batch).unwrap();
+
+        let token = dynamo_runtime::CancellationToken::new();
+
+        // 2) channel feeding the processor; dropping the sender lets the
+        //    processor exit on its own once the single message is consumed
+        let (tx, rx) = mpsc::unbounded_channel::<(u64, Vec<u8>)>();
+        tx.send((123, payload.clone())).unwrap(); // seq = 123
+        drop(tx);
+
+        // 3) mock component to capture output
+        let (comp, published) = MockComponent::new();
+
+        // 4) run the function under test (let it consume exactly one msg)
+        let handle = tokio::spawn(start_event_processor(
+            rx,
+            comp,
+            worker_id,
+            kv_block_size,
+            Arc::new(AtomicU32::new(0)),
+            token,
+        ));
+
+        tokio::time::timeout(std::time::Duration::from_secs(1), handle)
+            .await
+            .unwrap()
+            .unwrap();
+
+        // 5) assert exactly one event was published, on the KV event subject
+        let published = published.lock().unwrap();
+        assert_eq!(published.len(), 1);
+        let (subject, bytes) = &published[0];
+
+        assert_eq!(subject, &KV_EVENT_SUBJECT.to_string());
+        // NOTE(review): this only compares the first byte of the published
+        // encoding with the first byte of the input batch encoding; it does
+        // not verify worker_id or the event contents.
+        assert_eq!(bytes.first(), payload.first())
+    }
+
+    //--------------------------------------------------------------------
+    // Test start_zmq_listener end to end over a local TCP socket
+    //   (feed it frames from a ZMQ PUB socket bound on localhost)
+    //--------------------------------------------------------------------
+    #[tokio::test]
+    async fn test_start_zmq_listener_pushes_to_channel() {
+        // Prepare channel that listener should fill
+        let (tx, mut rx) = mpsc::unbounded_channel::<(u64, Vec<u8>)>();
+
+        // ZMQ TCP endpoint using localhost with fixed port
+        // NOTE(review): the bind below fails if port 15555 is already in use.
+        let endpoint = "tcp://127.0.0.1:15555";
+        let topic = "".to_string(); // subscribe to all
+
+        // Publisher side - set up first
+        let mut pub_socket = PubSocket::new();
+        pub_socket.bind(endpoint).await.unwrap();
+
+        // Cancellation token so we can stop the listener
+        let token = dynamo_runtime::CancellationToken::new();
+
+        // Spawn async listener
+        let listener_handle = tokio::spawn({
+            let token = token.clone();
+            start_zmq_listener(endpoint.to_string(), topic, tx, token)
+        });
+
+        // Give time for the connection to establish
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        // Send synthetic 3-frame message: [topic, seq(8B), payload]
+        let seq: u64 = 77;
+        let payload = Bytes::from("hello");
+
+        let frames = vec![
+            Bytes::from(""),
+            Bytes::from(seq.to_be_bytes().to_vec()),
+            payload.clone(),
+        ];
+
+        // Create a proper multipart message
+        let msg = ZmqMessage::try_from(frames).expect("Failed to create ZmqMessage");
+
+        // Send the multipart message
+        pub_socket.send(msg).await.unwrap();
+
+        // Wait (bounded) for the listener to forward the message instead of
+        // sleeping a fixed time and hoping it has arrived
+        let (got_seq, got_payload) =
+            tokio::time::timeout(std::time::Duration::from_secs(1), rx.recv())
+                .await
+                .expect("timed out waiting for message")
+                .expect("no message received");
+        assert_eq!(got_seq, seq);
+        assert_eq!(got_payload, payload);
+
+        // Stop the listener
+        token.cancel();
+        let _ = listener_handle.await;
+    }
+}
+
+/// Tests for `calculate_backoff_ms` and the constants it is built on.
+#[cfg(test)]
+mod test_exponential_backoff {
+    use super::*;
+
+    #[test]
+    fn test_backoff_calculation_progression() {
+        // (consecutive errors, expected delay in ms): 10 * 2^errors
+        let progression = [
+            (0, 10),
+            (1, 20),
+            (2, 40),
+            (3, 80),
+            (4, 160),
+            (5, 320),
+            (6, 640),
+            (7, 1280),
+            (8, 2560),
+        ];
+
+        for (errors, expected_ms) in progression {
+            assert_eq!(calculate_backoff_ms(errors), expected_ms);
+        }
+    }
+
+    #[test]
+    fn test_backoff_caps_at_max_exponent() {
+        // From MAX_BACKOFF_EXPONENT onwards the delay stays at 10 * 2^8 = 2560ms
+        for errors in [8, 9, 100] {
+            assert_eq!(calculate_backoff_ms(errors), 2560);
+        }
+    }
+
+    #[test]
+    fn test_backoff_never_exceeds_max() {
+        // No error count may produce a delay above MAX_BACKOFF_MS
+        let mut errors = 0;
+        while errors < 20 {
+            assert!(calculate_backoff_ms(errors) <= MAX_BACKOFF_MS);
+            errors += 1;
+        }
+    }
+
+    #[test]
+    #[allow(clippy::assertions_on_constants)]
+    fn test_backoff_constants_are_sane() {
+        // The constants must be consistent with one another
+        assert!(INITIAL_BACKOFF_MS > 0);
+        assert!(MAX_BACKOFF_MS > INITIAL_BACKOFF_MS);
+        assert!(MAX_BACKOFF_EXPONENT <= 10); // Prevent crazy exponents
+        assert!(MAX_CONSECUTIVE_ERRORS > 0);
+
+        // The largest delay the exponent cap allows must not exceed MAX_BACKOFF_MS
+        let largest_uncapped = INITIAL_BACKOFF_MS * 2_u64.pow(MAX_BACKOFF_EXPONENT);
+        assert!(largest_uncapped <= MAX_BACKOFF_MS);
+    }
+}
--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
@@ -277,8 +277,7 @@ impl WorkerSelector for DefaultWorkerSelector {
            let score = worker_scores.get(&worker_id).copied().unwrap_or(0.0);

            // Calculate normalized metrics
-            assert!(ep.data.kv_total_blocks > 0);
-            let gpu_cache_usage = ep.data.kv_active_blocks as f64 / ep.data.kv_total_blocks as f64;
+            let gpu_cache_usage = ep.data.gpu_cache_usage_perc as f64;
            let normalized_waiting = if max_waiting > 0.0 {
                ep.data.num_requests_waiting as f64 / max_waiting
            } else {

--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -393,6 +393,7 @@ impl Scheduler {
        };

        ForwardPassMetrics {
+            data_parallel_rank: None, // Default for backwards compatibility
            request_active_slots: state.running.len() as u64,
            request_total_slots: 420, // Dummy value as specified
            kv_active_blocks: active_blocks_count,