Unverified commit 6783bdca, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: enable local indexers by default, and use normal event plane by default...


chore: enable local indexers by default, and use normal event plane by default (not jetstream) (#5941)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 3d7182b8
......@@ -84,7 +84,7 @@ class TRTLLMProcess:
single_gpu: bool = False,
request_plane: str = "tcp",
store_backend: str = "etcd",
enable_local_indexer: bool = False,
durable_kv_events: bool = False,
):
"""Initialize TRT-LLM workers with dynamo integration.
......@@ -104,7 +104,7 @@ class TRTLLMProcess:
single_gpu: If True, all workers share GPU 0
request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False.
durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).
Note: TRT-LLM supports two forms of parallelism for routing:
1. Multiple workers (num_workers > 1): Each worker is a separate routing target
......@@ -168,6 +168,10 @@ class TRTLLMProcess:
if max_seq_len is not None:
command.extend(["--max-seq-len", str(max_seq_len)])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if durable_kv_events:
command.append("--durable-kv-events")
# Set tensor parallel size if specified (needed for attention DP)
if tensor_parallel_size is not None:
command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
......@@ -194,10 +198,6 @@ class TRTLLMProcess:
if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]
# Enable local indexer for NATS Core mode
if enable_local_indexer:
env_vars["DYN_LOCAL_INDEXER"] = "true"
env.update(env_vars)
# Create managed process for the worker
......@@ -485,13 +485,12 @@ def test_router_decisions_trtllm_multiple_workers(
@pytest.mark.gpu_1
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
@pytest.mark.parametrize(
"store_backend,use_nats_core,request_plane",
"store_backend,durable_kv_events,request_plane",
[
("etcd", False, "nats"), # JetStream mode
# ("etcd", True, "tcp"), # nats_core mode - disabled for now
# ("file", False, "nats"), # File backend - TODO: investigate file backend support for TRT-LLM
("etcd", False, "tcp"),
],
ids=["jetstream"],
ids=["nats_core"],
indirect=["durable_kv_events", "request_plane"],
)
def test_trtllm_indexers_sync(
request,
......@@ -500,7 +499,7 @@ def test_trtllm_indexers_sync(
file_storage_backend,
set_ucx_tls_no_mm,
store_backend,
use_nats_core,
durable_kv_events,
request_plane,
):
"""
......@@ -508,15 +507,15 @@ def test_trtllm_indexers_sync(
with TRT-LLM workers. This test verifies that both routers converge to the same internal state.
Tests with configuration:
- jetstream: etcd backend, JetStream for KV events, NATS request plane
- tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane
- nats_core: etcd backend, local indexer with NATS Core, TCP request plane
(includes NATS interruption/recovery testing)
"""
# runtime_services_dynamic_ports handles NATS and etcd startup
nats_process, _etcd_process = runtime_services_dynamic_ports
logger.info(
f"Starting TRT-LLM indexers sync test: store_backend={store_backend}, "
f"use_nats_core={use_nats_core}, request_plane={request_plane}"
f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
)
N_TRTLLM_WORKERS = 2
......@@ -531,13 +530,14 @@ def test_trtllm_indexers_sync(
single_gpu=True, # fit workers into one GPU
request_plane=request_plane,
store_backend=store_backend,
enable_local_indexer=use_nats_core,
durable_kv_events=durable_kv_events,
)
logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
trtllm_workers.__enter__()
# Use the common test implementation (creates its own runtimes for each router)
# Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
# When using durable_kv_events=True, use JetStream mode for the router
_test_router_indexers_sync(
engine_workers=trtllm_workers,
block_size=TRTLLM_BLOCK_SIZE,
......@@ -545,8 +545,9 @@ def test_trtllm_indexers_sync(
num_workers=N_TRTLLM_WORKERS,
store_backend=store_backend,
request_plane=request_plane,
test_nats_interruption=use_nats_core,
nats_server=nats_process if use_nats_core else None,
test_nats_interruption=not durable_kv_events,
nats_server=nats_process if not durable_kv_events else None,
durable_kv_events=durable_kv_events,
)
logger.info("TRT-LLM indexers sync test completed successfully")
......
......@@ -87,7 +87,7 @@ class VLLMProcess:
data_parallel_size: Optional[int] = None,
request_plane: str = "tcp",
store_backend: str = "etcd",
enable_local_indexer: bool = False,
durable_kv_events: bool = False,
):
"""Initialize vLLM workers with dynamo integration.
......@@ -105,7 +105,7 @@ class VLLMProcess:
data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)
request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False.
durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).
"""
# Generate unique namespace for isolation
namespace_suffix = generate_random_suffix()
......@@ -197,6 +197,10 @@ class VLLMProcess:
]
)
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if durable_kv_events:
command.append("--durable-kv-events")
env = os.environ.copy() # Copy parent environment
env_vars = {
"CUDA_VISIBLE_DEVICES": gpu_device,
......@@ -211,10 +215,6 @@ class VLLMProcess:
if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]
# Enable local indexer for NATS Core mode
if enable_local_indexer:
env_vars["DYN_LOCAL_INDEXER"] = "true"
env.update(env_vars)
# Create managed process for the worker
......@@ -487,13 +487,12 @@ def test_router_decisions_vllm_dp(
@pytest.mark.gpu_1
@pytest.mark.timeout(150) # ~3x average (~43s/test), rounded up
@pytest.mark.parametrize(
"store_backend,use_nats_core,request_plane",
"store_backend,durable_kv_events,request_plane",
[
("etcd", False, "nats"), # JetStream mode
# ("etcd", True, "tcp"), # nats_core mode - disabled for now
# ("file", False, "nats"), # File backend
("etcd", False, "tcp"),
],
ids=["jetstream"],
ids=["nats_core"],
indirect=["durable_kv_events", "request_plane"],
)
def test_vllm_indexers_sync(
request,
......@@ -502,7 +501,7 @@ def test_vllm_indexers_sync(
file_storage_backend,
set_ucx_tls_no_mm,
store_backend,
use_nats_core,
durable_kv_events,
request_plane,
):
"""
......@@ -510,15 +509,15 @@ def test_vllm_indexers_sync(
with vLLM workers. This test verifies that both routers converge to the same internal state.
Tests with configuration:
- jetstream: etcd backend, JetStream for KV events, NATS request plane
- tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane
- nats_core: etcd backend, local indexer with NATS Core, TCP request plane
(includes NATS interruption/recovery testing)
"""
# runtime_services_dynamic_ports handles NATS and etcd startup
nats_process, _etcd_process = runtime_services_dynamic_ports
logger.info(
f"Starting vLLM indexers sync test: store_backend={store_backend}, "
f"use_nats_core={use_nats_core}, request_plane={request_plane}"
f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
)
N_VLLM_WORKERS = 2
......@@ -533,13 +532,14 @@ def test_vllm_indexers_sync(
single_gpu=True, # fit workers into one GPU
request_plane=request_plane,
store_backend=store_backend,
enable_local_indexer=use_nats_core,
durable_kv_events=durable_kv_events,
)
logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
vllm_workers.__enter__()
# Use the common test implementation (creates its own runtimes for each router)
# Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
# When using durable_kv_events=True, use JetStream mode for the router
_test_router_indexers_sync(
engine_workers=vllm_workers,
block_size=BLOCK_SIZE,
......@@ -547,8 +547,9 @@ def test_vllm_indexers_sync(
num_workers=N_VLLM_WORKERS,
store_backend=store_backend,
request_plane=request_plane,
test_nats_interruption=use_nats_core,
nats_server=nats_process if use_nats_core else None,
test_nats_interruption=not durable_kv_events,
nats_server=nats_process if not durable_kv_events else None,
durable_kv_events=durable_kv_events,
)
logger.info("vLLM indexers sync test completed successfully")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment