Unverified commit 6783bdca, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: enable local indexers by default, and use normal event plane by default...


chore: enable local indexers by default, and use normal event plane by default (not jetstream) (#5941)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 3d7182b8
...@@ -84,7 +84,7 @@ class TRTLLMProcess: ...@@ -84,7 +84,7 @@ class TRTLLMProcess:
single_gpu: bool = False, single_gpu: bool = False,
request_plane: str = "tcp", request_plane: str = "tcp",
store_backend: str = "etcd", store_backend: str = "etcd",
enable_local_indexer: bool = False, durable_kv_events: bool = False,
): ):
"""Initialize TRT-LLM workers with dynamo integration. """Initialize TRT-LLM workers with dynamo integration.
...@@ -104,7 +104,7 @@ class TRTLLMProcess: ...@@ -104,7 +104,7 @@ class TRTLLMProcess:
single_gpu: If True, all workers share GPU 0 single_gpu: If True, all workers share GPU 0
request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp". request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd". store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False. durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).
Note: TRT-LLM supports two forms of parallelism for routing: Note: TRT-LLM supports two forms of parallelism for routing:
1. Multiple workers (num_workers > 1): Each worker is a separate routing target 1. Multiple workers (num_workers > 1): Each worker is a separate routing target
...@@ -168,6 +168,10 @@ class TRTLLMProcess: ...@@ -168,6 +168,10 @@ class TRTLLMProcess:
if max_seq_len is not None: if max_seq_len is not None:
command.extend(["--max-seq-len", str(max_seq_len)]) command.extend(["--max-seq-len", str(max_seq_len)])
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if durable_kv_events:
command.append("--durable-kv-events")
# Set tensor parallel size if specified (needed for attention DP) # Set tensor parallel size if specified (needed for attention DP)
if tensor_parallel_size is not None: if tensor_parallel_size is not None:
command.extend(["--tensor-parallel-size", str(tensor_parallel_size)]) command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
...@@ -194,10 +198,6 @@ class TRTLLMProcess: ...@@ -194,10 +198,6 @@ class TRTLLMProcess:
if self.store_backend == "file" and "DYN_FILE_KV" in os.environ: if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"] env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]
# Enable local indexer for NATS Core mode
if enable_local_indexer:
env_vars["DYN_LOCAL_INDEXER"] = "true"
env.update(env_vars) env.update(env_vars)
# Create managed process for the worker # Create managed process for the worker
...@@ -485,13 +485,12 @@ def test_router_decisions_trtllm_multiple_workers( ...@@ -485,13 +485,12 @@ def test_router_decisions_trtllm_multiple_workers(
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up @pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
@pytest.mark.parametrize( @pytest.mark.parametrize(
"store_backend,use_nats_core,request_plane", "store_backend,durable_kv_events,request_plane",
[ [
("etcd", False, "nats"), # JetStream mode ("etcd", False, "tcp"),
# ("etcd", True, "tcp"), # nats_core mode - disabled for now
# ("file", False, "nats"), # File backend - TODO: investigate file backend support for TRT-LLM
], ],
ids=["jetstream"], ids=["nats_core"],
indirect=["durable_kv_events", "request_plane"],
) )
def test_trtllm_indexers_sync( def test_trtllm_indexers_sync(
request, request,
...@@ -500,7 +499,7 @@ def test_trtllm_indexers_sync( ...@@ -500,7 +499,7 @@ def test_trtllm_indexers_sync(
file_storage_backend, file_storage_backend,
set_ucx_tls_no_mm, set_ucx_tls_no_mm,
store_backend, store_backend,
use_nats_core, durable_kv_events,
request_plane, request_plane,
): ):
""" """
...@@ -508,15 +507,15 @@ def test_trtllm_indexers_sync( ...@@ -508,15 +507,15 @@ def test_trtllm_indexers_sync(
with TRT-LLM workers. This test verifies that both routers converge to the same internal state. with TRT-LLM workers. This test verifies that both routers converge to the same internal state.
Tests with configuration: Tests with configuration:
- jetstream: etcd backend, JetStream for KV events, NATS request plane - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
- tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane (includes NATS interruption/recovery testing)
""" """
# runtime_services_dynamic_ports handles NATS and etcd startup # runtime_services_dynamic_ports handles NATS and etcd startup
nats_process, _etcd_process = runtime_services_dynamic_ports nats_process, _etcd_process = runtime_services_dynamic_ports
logger.info( logger.info(
f"Starting TRT-LLM indexers sync test: store_backend={store_backend}, " f"Starting TRT-LLM indexers sync test: store_backend={store_backend}, "
f"use_nats_core={use_nats_core}, request_plane={request_plane}" f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
) )
N_TRTLLM_WORKERS = 2 N_TRTLLM_WORKERS = 2
...@@ -531,13 +530,14 @@ def test_trtllm_indexers_sync( ...@@ -531,13 +530,14 @@ def test_trtllm_indexers_sync(
single_gpu=True, # fit workers into one GPU single_gpu=True, # fit workers into one GPU
request_plane=request_plane, request_plane=request_plane,
store_backend=store_backend, store_backend=store_backend,
enable_local_indexer=use_nats_core, durable_kv_events=durable_kv_events,
) )
logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}") logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
trtllm_workers.__enter__() trtllm_workers.__enter__()
# Use the common test implementation (creates its own runtimes for each router) # Use the common test implementation (creates its own runtimes for each router)
# Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
# When using durable_kv_events=True, use JetStream mode for the router
_test_router_indexers_sync( _test_router_indexers_sync(
engine_workers=trtllm_workers, engine_workers=trtllm_workers,
block_size=TRTLLM_BLOCK_SIZE, block_size=TRTLLM_BLOCK_SIZE,
...@@ -545,8 +545,9 @@ def test_trtllm_indexers_sync( ...@@ -545,8 +545,9 @@ def test_trtllm_indexers_sync(
num_workers=N_TRTLLM_WORKERS, num_workers=N_TRTLLM_WORKERS,
store_backend=store_backend, store_backend=store_backend,
request_plane=request_plane, request_plane=request_plane,
test_nats_interruption=use_nats_core, test_nats_interruption=not durable_kv_events,
nats_server=nats_process if use_nats_core else None, nats_server=nats_process if not durable_kv_events else None,
durable_kv_events=durable_kv_events,
) )
logger.info("TRT-LLM indexers sync test completed successfully") logger.info("TRT-LLM indexers sync test completed successfully")
......
...@@ -87,7 +87,7 @@ class VLLMProcess: ...@@ -87,7 +87,7 @@ class VLLMProcess:
data_parallel_size: Optional[int] = None, data_parallel_size: Optional[int] = None,
request_plane: str = "tcp", request_plane: str = "tcp",
store_backend: str = "etcd", store_backend: str = "etcd",
enable_local_indexer: bool = False, durable_kv_events: bool = False,
): ):
"""Initialize vLLM workers with dynamo integration. """Initialize vLLM workers with dynamo integration.
...@@ -105,7 +105,7 @@ class VLLMProcess: ...@@ -105,7 +105,7 @@ class VLLMProcess:
data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size) data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)
request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp". request_plane: Request plane to use ("nats", "tcp", or "http"). Defaults to "tcp".
store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd". store_backend: Storage backend to use ("etcd" or "file"). Defaults to "etcd".
enable_local_indexer: If True, enable worker-local KV indexer for NATS Core mode. Defaults to False. durable_kv_events: If True, use JetStream for durable KV events. Defaults to False (NATS Core mode).
""" """
# Generate unique namespace for isolation # Generate unique namespace for isolation
namespace_suffix = generate_random_suffix() namespace_suffix = generate_random_suffix()
...@@ -197,6 +197,10 @@ class VLLMProcess: ...@@ -197,6 +197,10 @@ class VLLMProcess:
] ]
) )
# Use --durable-kv-events to enable JetStream mode (local indexer disabled)
if durable_kv_events:
command.append("--durable-kv-events")
env = os.environ.copy() # Copy parent environment env = os.environ.copy() # Copy parent environment
env_vars = { env_vars = {
"CUDA_VISIBLE_DEVICES": gpu_device, "CUDA_VISIBLE_DEVICES": gpu_device,
...@@ -211,10 +215,6 @@ class VLLMProcess: ...@@ -211,10 +215,6 @@ class VLLMProcess:
if self.store_backend == "file" and "DYN_FILE_KV" in os.environ: if self.store_backend == "file" and "DYN_FILE_KV" in os.environ:
env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"] env_vars["DYN_FILE_KV"] = os.environ["DYN_FILE_KV"]
# Enable local indexer for NATS Core mode
if enable_local_indexer:
env_vars["DYN_LOCAL_INDEXER"] = "true"
env.update(env_vars) env.update(env_vars)
# Create managed process for the worker # Create managed process for the worker
...@@ -487,13 +487,12 @@ def test_router_decisions_vllm_dp( ...@@ -487,13 +487,12 @@ def test_router_decisions_vllm_dp(
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.timeout(150) # ~3x average (~43s/test), rounded up @pytest.mark.timeout(150) # ~3x average (~43s/test), rounded up
@pytest.mark.parametrize( @pytest.mark.parametrize(
"store_backend,use_nats_core,request_plane", "store_backend,durable_kv_events,request_plane",
[ [
("etcd", False, "nats"), # JetStream mode ("etcd", False, "tcp"),
# ("etcd", True, "tcp"), # nats_core mode - disabled for now
# ("file", False, "nats"), # File backend
], ],
ids=["jetstream"], ids=["nats_core"],
indirect=["durable_kv_events", "request_plane"],
) )
def test_vllm_indexers_sync( def test_vllm_indexers_sync(
request, request,
...@@ -502,7 +501,7 @@ def test_vllm_indexers_sync( ...@@ -502,7 +501,7 @@ def test_vllm_indexers_sync(
file_storage_backend, file_storage_backend,
set_ucx_tls_no_mm, set_ucx_tls_no_mm,
store_backend, store_backend,
use_nats_core, durable_kv_events,
request_plane, request_plane,
): ):
""" """
...@@ -510,15 +509,15 @@ def test_vllm_indexers_sync( ...@@ -510,15 +509,15 @@ def test_vllm_indexers_sync(
with vLLM workers. This test verifies that both routers converge to the same internal state. with vLLM workers. This test verifies that both routers converge to the same internal state.
Tests with configuration: Tests with configuration:
- jetstream: etcd backend, JetStream for KV events, NATS request plane - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
- tcp_nats_core: etcd backend, local indexer with NATS Core, TCP request plane (includes NATS interruption/recovery testing)
""" """
# runtime_services_dynamic_ports handles NATS and etcd startup # runtime_services_dynamic_ports handles NATS and etcd startup
nats_process, _etcd_process = runtime_services_dynamic_ports nats_process, _etcd_process = runtime_services_dynamic_ports
logger.info( logger.info(
f"Starting vLLM indexers sync test: store_backend={store_backend}, " f"Starting vLLM indexers sync test: store_backend={store_backend}, "
f"use_nats_core={use_nats_core}, request_plane={request_plane}" f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
) )
N_VLLM_WORKERS = 2 N_VLLM_WORKERS = 2
...@@ -533,13 +532,14 @@ def test_vllm_indexers_sync( ...@@ -533,13 +532,14 @@ def test_vllm_indexers_sync(
single_gpu=True, # fit workers into one GPU single_gpu=True, # fit workers into one GPU
request_plane=request_plane, request_plane=request_plane,
store_backend=store_backend, store_backend=store_backend,
enable_local_indexer=use_nats_core, durable_kv_events=durable_kv_events,
) )
logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}") logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
vllm_workers.__enter__() vllm_workers.__enter__()
# Use the common test implementation (creates its own runtimes for each router) # Use the common test implementation (creates its own runtimes for each router)
# Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
# When using durable_kv_events=True, use JetStream mode for the router
_test_router_indexers_sync( _test_router_indexers_sync(
engine_workers=vllm_workers, engine_workers=vllm_workers,
block_size=BLOCK_SIZE, block_size=BLOCK_SIZE,
...@@ -547,8 +547,9 @@ def test_vllm_indexers_sync( ...@@ -547,8 +547,9 @@ def test_vllm_indexers_sync(
num_workers=N_VLLM_WORKERS, num_workers=N_VLLM_WORKERS,
store_backend=store_backend, store_backend=store_backend,
request_plane=request_plane, request_plane=request_plane,
test_nats_interruption=use_nats_core, test_nats_interruption=not durable_kv_events,
nats_server=nats_process if use_nats_core else None, nats_server=nats_process if not durable_kv_events else None,
durable_kv_events=durable_kv_events,
) )
logger.info("vLLM indexers sync test completed successfully") logger.info("vLLM indexers sync test completed successfully")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment