chore: enable local indexers by default, and use normal event plane by default...

chore: enable local indexers by default, and use normal event plane by default (not jetstream) (#5941)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore: enable local indexers by default, and use normal event plane by default...
chore: enable local indexers by default, and use normal event plane by default (not jetstream) (#5941)

Signed-off-by: PeaBrane <yanrpei@gmail.com>
6783bdca · Yan Ru Pei · GitHub · 3d7182b8 · 6783bdca · 6783bdca
Unverified commit 6783bdca, authored Feb 08, 2026 by Yan Ru Pei; committed by GitHub on Feb 08, 2026.
Showing with 38 additions and 36 deletions

tests/router/test_router_e2e_with_trtllm.py tests/router/test_router_e2e_with_trtllm.py +19 -18

tests/router/test_router_e2e_with_vllm.py tests/router/test_router_e2e_with_vllm.py +19 -18

No files found.
--- a/tests/router/test_router_e2e_with_trtllm.py
+++ b/tests/router/test_router_e2e_with_trtllm.py
@@ -84,7 +84,7 @@ class TRTLLMProcess:
        single_gpu: bool = False,
        request_plane: str = "tcp",
        store_backend: str = "etcd",
-        enable_local_indexer: bool = False,
+        durable_kv_events: bool = False,
    ):
        """Initialize TRT-LLM workers with dynamo integration.

@@ -104,7 +104,7 @@ class TRTLLMProcess:
            single_gpu: If True, all workers share GPU 0
            request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
            store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
-            enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False.
+            durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).

        Note: TRT-LLM supports two forms of parallelism for routing:
              1. Multiple workers (num_workers > 1): Each worker is a separate routing target
@@ -168,6 +168,10 @@ class TRTLLMProcess:
            if max_seq_len is not None:
                command.extend(["--max-seq-len", str(max_seq_len)])

+            # Use --durable-kv-events to enable JetStream mode (local indexer disabled)
+            if durable_kv_events:
+                command.append("--durable-kv-events")
+
            # Set tensor parallel size if specified (needed for attention DP)
            if tensor_parallel_size is not None:
                command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
@@ -194,10 +198,6 @@ class TRTLLMProcess:
            if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
                env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]

-            # Enable local indexer for NATS Core mode
-            if enable_local_indexer:
-                env_vars["DYN_LOCAL_INDEXER"] = "true"
-
            env.update(env_vars)

            # Create managed process for the worker
@@ -485,13 +485,12 @@ def test_router_decisions_trtllm_multiple_workers(
 @pytest.mark.gpu_1
 @pytest.mark.timeout(150)  # ~3x average (~45s/test), rounded up
 @pytest.mark.parametrize(
-    "store_backend,use_nats_core,request_plane",
+    "store_backend,durable_kv_events,request_plane",
    [
-        ("etcd", False, "nats"),  # JetStream mode
-        # ("etcd", True, "tcp"),  # nats_core mode - disabled for now
-        # ("file", False, "nats"),  # File backend - TODO: investigate file backend support for TRT-LLM
+        ("etcd", False, "tcp"),
    ],
-    ids=["jetstream"],
+    ids=["nats_core"],
+    indirect=["durable_kv_events", "request_plane"],
 )
 def test_trtllm_indexers_sync(
    request,
@@ -500,7 +499,7 @@ def test_trtllm_indexers_sync(
    file_storage_backend,
    set_ucx_tls_no_mm,
    store_backend,
-    use_nats_core,
+    durable_kv_events,
    request_plane,
 ):
    """
@@ -508,15 +507,15 @@ def test_trtllm_indexers_sync(
    with TRT-LLM workers. This test verifies that both routers converge to the same internal state.

    Tests with configuration:
-    - jetstream: etcd backend, JetStream for KV events, NATS request plane
-    - tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane
+    - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
+                 (includes NATS interruption/recovery testing)
    """
    # runtime_services_dynamic_ports handles NATS and etcd startup
    nats_process, _etcd_process = runtime_services_dynamic_ports

    logger.info(
        f"Starting TRT-LLM indexers sync test: store_backend={store_backend}, "
-        f"use_nats_core={use_nats_core}, request_plane={request_plane}"
+        f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
    )

    N_TRTLLM_WORKERS = 2
@@ -531,13 +530,14 @@ def test_trtllm_indexers_sync(
            single_gpu=True,  # fit workers into one GPU
            request_plane=request_plane,
            store_backend=store_backend,
-            enable_local_indexer=use_nats_core,
+            durable_kv_events=durable_kv_events,
        )
        logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
        trtllm_workers.__enter__()

        # Use the common test implementation (creates its own runtimes for each router)
        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
+        # When using durable_kv_events=True, use JetStream mode for the router
        _test_router_indexers_sync(
            engine_workers=trtllm_workers,
            block_size=TRTLLM_BLOCK_SIZE,
@@ -545,8 +545,9 @@ def test_trtllm_indexers_sync(
            num_workers=N_TRTLLM_WORKERS,
            store_backend=store_backend,
            request_plane=request_plane,
-            test_nats_interruption=use_nats_core,
-            nats_server=nats_process if use_nats_core else None,
+            test_nats_interruption=not durable_kv_events,
+            nats_server=nats_process if not durable_kv_events else None,
+            durable_kv_events=durable_kv_events,
        )

        logger.info("TRT-LLM indexers sync test completed successfully")

--- a/tests/router/test_router_e2e_with_vllm.py
+++ b/tests/router/test_router_e2e_with_vllm.py
@@ -87,7 +87,7 @@ class VLLMProcess:
        data_parallel_size: Optional[int] = None,
        request_plane: str = "tcp",
        store_backend: str = "etcd",
-        enable_local_indexer: bool = False,
+        durable_kv_events: bool = False,
    ):
        """Initialize vLLM workers with dynamo integration.

@@ -105,7 +105,7 @@ class VLLMProcess:
            data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)
            request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
            store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
-            enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False.
+            durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).
        """
        # Generate unique namespace for isolation
        namespace_suffix = generate_random_suffix()
@@ -197,6 +197,10 @@ class VLLMProcess:
                    ]
                )

+            # Use --durable-kv-events to enable JetStream mode (local indexer disabled)
+            if durable_kv_events:
+                command.append("--durable-kv-events")
+
            env = os.environ.copy()  # Copy parent environment
            env_vars = {
                "CUDA_VISIBLE_DEVICES": gpu_device,
@@ -211,10 +215,6 @@ class VLLMProcess:
            if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
                env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]

-            # Enable local indexer for NATS Core mode
-            if enable_local_indexer:
-                env_vars["DYN_LOCAL_INDEXER"] = "true"
-
            env.update(env_vars)

            # Create managed process for the worker
@@ -487,13 +487,12 @@ def test_router_decisions_vllm_dp(
 @pytest.mark.gpu_1
 @pytest.mark.timeout(150)  # ~3x average (~43s/test), rounded up
 @pytest.mark.parametrize(
-    "store_backend,use_nats_core,request_plane",
+    "store_backend,durable_kv_events,request_plane",
    [
-        ("etcd", False, "nats"),  # JetStream mode
-        # ("etcd", True, "tcp"),  # nats_core mode - disabled for now
-        # ("file", False, "nats"),  # File backend
+        ("etcd", False, "tcp"),
    ],
-    ids=["jetstream"],
+    ids=["nats_core"],
+    indirect=["durable_kv_events", "request_plane"],
 )
 def test_vllm_indexers_sync(
    request,
@@ -502,7 +501,7 @@ def test_vllm_indexers_sync(
    file_storage_backend,
    set_ucx_tls_no_mm,
    store_backend,
-    use_nats_core,
+    durable_kv_events,
    request_plane,
 ):
    """
@@ -510,15 +509,15 @@ def test_vllm_indexers_sync(
    with vLLM workers. This test verifies that both routers converge to the same internal state.

    Tests with configuration:
-    - jetstream: etcd backend, JetStream for KV events, NATS request plane
-    - tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane
+    - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
+                 (includes NATS interruption/recovery testing)
    """
    # runtime_services_dynamic_ports handles NATS and etcd startup
    nats_process, _etcd_process = runtime_services_dynamic_ports

    logger.info(
        f"Starting vLLM indexers sync test: store_backend={store_backend}, "
-        f"use_nats_core={use_nats_core}, request_plane={request_plane}"
+        f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
    )

    N_VLLM_WORKERS = 2
@@ -533,13 +532,14 @@ def test_vllm_indexers_sync(
            single_gpu=True,  # fit workers into one GPU
            request_plane=request_plane,
            store_backend=store_backend,
-            enable_local_indexer=use_nats_core,
+            durable_kv_events=durable_kv_events,
        )
        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
        vllm_workers.__enter__()

        # Use the common test implementation (creates its own runtimes for each router)
        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
+        # When using durable_kv_events=True, use JetStream mode for the router
        _test_router_indexers_sync(
            engine_workers=vllm_workers,
            block_size=BLOCK_SIZE,
@@ -547,8 +547,9 @@ def test_vllm_indexers_sync(
            num_workers=N_VLLM_WORKERS,
            store_backend=store_backend,
            request_plane=request_plane,
-            test_nats_interruption=use_nats_core,
-            nats_server=nats_process if use_nats_core else None,
+            test_nats_interruption=not durable_kv_events,
+            nats_server=nats_process if not durable_kv_events else None,
+            durable_kv_events=durable_kv_events,
        )

        logger.info("vLLM indexers sync test completed successfully")