Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b6596c52
Unverified
Commit
b6596c52
authored
Mar 15, 2026
by
Yan Ru Pei
Committed by
GitHub
Mar 15, 2026
Browse files
chore(kv-router): remove native kv-indexer binary, use maturin-built one (#7338)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
9e9ca3e2
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
57 additions
and
405 deletions
+57
-405
lib/kv-router/src/standalone_indexer/server.rs
lib/kv-router/src/standalone_indexer/server.rs
+32
-32
lib/llm/Cargo.toml
lib/llm/Cargo.toml
+1
-2
tests/router/test_router_e2e_with_mockers.py
tests/router/test_router_e2e_with_mockers.py
+24
-371
No files found.
lib/kv-router/src/standalone_indexer/server.rs
View file @
b6596c52
...
...
@@ -15,7 +15,7 @@ use serde::{Deserialize, Serialize};
use
crate
::
protocols
::{
LocalBlockHash
,
WorkerId
,
compute_block_hash_for_seq
};
use
super
::
registry
::{
IndexerKey
,
WorkerRegistry
};
use
super
::
registry
::{
IndexerKey
,
ListenerControlError
,
WorkerRegistry
};
pub
struct
AppState
{
pub
registry
:
Arc
<
WorkerRegistry
>
,
...
...
@@ -51,12 +51,6 @@ pub struct UnregisterRequest {
pub
dp_rank
:
Option
<
u32
>
,
}
#[derive(Serialize)]
struct
WorkerInfo
{
instance_id
:
WorkerId
,
endpoints
:
HashMap
<
u32
,
String
>
,
}
#[derive(Deserialize)]
pub
struct
QueryRequest
{
pub
token_ids
:
Vec
<
u32
>
,
...
...
@@ -86,6 +80,15 @@ async fn register(
State
(
state
):
State
<
Arc
<
AppState
>>
,
Json
(
req
):
Json
<
RegisterRequest
>
,
)
->
impl
IntoResponse
{
if
let
Err
(
error
)
=
super
::
validate_listener_endpoints
(
&
req
.endpoint
,
req
.replay_endpoint
.as_deref
())
{
return
(
StatusCode
::
BAD_REQUEST
,
Json
(
serde_json
::
json!
({
"error"
:
error
.to_string
()})),
);
}
match
state
.registry
.register
(
...
...
@@ -146,16 +149,7 @@ async fn unregister(
}
async
fn
list_workers
(
State
(
state
):
State
<
Arc
<
AppState
>>
)
->
impl
IntoResponse
{
let
workers
:
Vec
<
WorkerInfo
>
=
state
.registry
.list
()
.into_iter
()
.map
(|(
instance_id
,
endpoints
)|
WorkerInfo
{
instance_id
,
endpoints
,
})
.collect
();
Json
(
workers
)
Json
(
state
.registry
.list
())
}
fn
build_score_response
(
...
...
@@ -254,7 +248,6 @@ async fn query_by_hash(
}
}
#[cfg(feature
=
"test-endpoints"
)]
#[derive(Deserialize)]
struct
ListenerControlRequest
{
instance_id
:
WorkerId
,
...
...
@@ -262,7 +255,6 @@ struct ListenerControlRequest {
dp_rank
:
Option
<
u32
>
,
}
#[cfg(feature
=
"test-endpoints"
)]
async
fn
test_pause_listener
(
State
(
state
):
State
<
Arc
<
AppState
>>
,
Json
(
req
):
Json
<
ListenerControlRequest
>
,
...
...
@@ -272,14 +264,10 @@ async fn test_pause_listener(
.pause_listener
(
req
.instance_id
,
req
.dp_rank
.unwrap_or
(
0
))
{
Ok
(())
=>
(
StatusCode
::
OK
,
Json
(
serde_json
::
json!
({
"status"
:
"ok"
}))),
Err
(
e
)
=>
(
StatusCode
::
NOT_FOUND
,
Json
(
serde_json
::
json!
({
"error"
:
e
.to_string
()})),
),
Err
(
error
)
=>
listener_control_error_response
(
error
),
}
}
#[cfg(feature
=
"test-endpoints"
)]
async
fn
test_resume_listener
(
State
(
state
):
State
<
Arc
<
AppState
>>
,
Json
(
req
):
Json
<
ListenerControlRequest
>
,
...
...
@@ -290,13 +278,26 @@ async fn test_resume_listener(
.await
{
Ok
(())
=>
(
StatusCode
::
OK
,
Json
(
serde_json
::
json!
({
"status"
:
"ok"
}))),
Err
(
e
)
=>
(
StatusCode
::
CONFLICT
,
Json
(
serde_json
::
json!
({
"error"
:
e
.to_string
()})),
),
Err
(
error
)
=>
listener_control_error_response
(
error
),
}
}
fn
listener_control_error_response
(
error
:
ListenerControlError
,
)
->
(
StatusCode
,
Json
<
serde_json
::
Value
>
)
{
let
status
=
match
&
error
{
ListenerControlError
::
WorkerNotFound
{
..
}
|
ListenerControlError
::
ListenerNotFound
{
..
}
=>
StatusCode
::
NOT_FOUND
,
ListenerControlError
::
DiscoveryManaged
{
..
}
|
ListenerControlError
::
InvalidPauseState
{
..
}
|
ListenerControlError
::
InvalidResumeState
{
..
}
=>
StatusCode
::
CONFLICT
,
};
(
status
,
Json
(
serde_json
::
json!
({
"error"
:
error
.to_string
()})),
)
}
#[derive(Deserialize)]
struct
PeerRequest
{
url
:
String
,
...
...
@@ -373,6 +374,7 @@ async fn handle_health() -> StatusCode {
#[cfg(feature
=
"metrics"
)]
async
fn
handle_metrics
(
State
(
state
):
State
<
Arc
<
AppState
>>
)
->
impl
IntoResponse
{
state
.registry
.refresh_metrics
();
let
encoder
=
prometheus
::
TextEncoder
::
new
();
let
mut
buf
=
Vec
::
new
();
encoder
...
...
@@ -401,12 +403,10 @@ pub fn create_router(state: Arc<AppState>) -> Router {
.route
(
"/peers"
,
get
(
list_peers
))
.route
(
"/health"
,
get
(
handle_health
));
#[cfg(feature
=
"test-endpoints"
)]
let
router
=
router
.route
(
"/test/pause_listener"
,
post
(
test_pause_listener
))
.route
(
"/test/resume_listener"
,
post
(
test_resume_listener
));
let
router
=
router
.with_state
(
state
.clone
());
.route
(
"/test/resume_listener"
,
post
(
test_resume_listener
))
.with_state
(
state
.clone
());
#[cfg(feature
=
"metrics"
)]
let
router
=
{
...
...
lib/llm/Cargo.toml
View file @
b6596c52
...
...
@@ -47,7 +47,7 @@ required-features = ["kv-router-stress"]
# repo
dynamo-runtime
=
{
workspace
=
true
}
dynamo-tokens
=
{
workspace
=
true
}
dynamo-kv-router
=
{
workspace
=
true
,
features
=
["metrics"]
}
dynamo-kv-router
=
{
workspace
=
true
,
features
=
[
"metrics"
,
"runtime-protocols"
]
}
dynamo-memory
=
{
workspace
=
true
}
dynamo-mocker
=
{
workspace
=
true
}
...
...
@@ -201,4 +201,3 @@ tonic-build = { version = "0.13.1" }
name
=
"bench_local_transfer_v2"
path
=
"bin/bench_local_transfer_v2.rs"
required-features
=
["block-manager-bench"]
tests/router/test_router_e2e_with_mockers.py
View file @
b6596c52
...
...
@@ -13,12 +13,10 @@
# endpoint_path (e.g., "generate") as the routing key, causing handler collisions when multiple
# workers register the same endpoint. This is a test-only limitation; production deployments
# with separate processes per worker work correctly with TCP.
import
asyncio
import
logging
import
os
from
typing
import
Any
,
Dict
,
Optional
import
aiohttp
import
pytest
from
tests.router.common
import
(
...
...
@@ -27,7 +25,6 @@ from tests.router.common import (
_test_router_basic
,
_test_router_decisions
,
_test_router_decisions_disagg
,
_test_router_indexers_sync
,
_test_router_overload_503
,
_test_router_query_instance_id
,
_test_router_two_routers
,
...
...
@@ -176,12 +173,7 @@ def _build_mocker_command(
class
MockerProcess
:
"""Manages mocker engine instances with shared tokio runtime via --num-workers.
When standalone_indexer=True, launches mockers one-by-one (each as --num-workers 1)
and runs a standalone HTTP KV indexer binary alongside them. Call launch_mockers_with_indexer()
in async context to start mockers and register their ZMQ ports with the indexer.
"""
"""Manages mocker engine instances with shared tokio runtime via --num-workers."""
def
__init__
(
self
,
...
...
@@ -191,7 +183,6 @@ class MockerProcess:
store_backend
:
str
=
"etcd"
,
request_plane
:
str
=
"nats"
,
zmq_kv_events
:
bool
=
False
,
standalone_indexer
:
bool
=
False
,
model_name
:
str
=
"mocker"
,
zmq_replay
:
bool
=
False
,
):
...
...
@@ -203,12 +194,6 @@ class MockerProcess:
self
.
num_workers
=
num_mockers
self
.
_zmq_kv_events_ports
:
list
[
int
]
=
[]
self
.
_zmq_replay_ports
:
list
[
int
]
=
[]
self
.
_standalone_indexer
=
standalone_indexer
self
.
_standalone_indexer_port
:
Optional
[
int
]
=
None
self
.
_standalone_indexer_b_port
:
Optional
[
int
]
=
None
self
.
_indexer_process
:
Optional
[
ManagedProcess
]
=
None
self
.
_indexer_b_process
:
Optional
[
ManagedProcess
]
=
None
self
.
_mocker_processes
:
list
[
ManagedProcess
]
=
[]
self
.
_request
=
request
self
.
_store_backend
=
store_backend
self
.
_request_plane
=
request_plane
...
...
@@ -231,8 +216,7 @@ class MockerProcess:
num_mockers
*
dp_size
,
BASE_PORT_ZMQ
)
bases
=
[
self
.
_zmq_kv_events_ports
[
i
*
dp_size
]
for
i
in
range
(
num_mockers
)]
if
not
standalone_indexer
:
mocker_args
[
"zmq_kv_events_ports"
]
=
","
.
join
(
str
(
p
)
for
p
in
bases
)
mocker_args
[
"zmq_kv_events_ports"
]
=
","
.
join
(
str
(
p
)
for
p
in
bases
)
logger
.
info
(
f
"Allocated ZMQ KV event ports
{
self
.
_zmq_kv_events_ports
}
"
f
"(bases:
{
bases
}
) for
{
num_mockers
}
workers"
...
...
@@ -247,285 +231,43 @@ class MockerProcess:
replay_bases
=
[
self
.
_zmq_replay_ports
[
i
*
dp_size
]
for
i
in
range
(
num_mockers
)
]
if
not
standalone_indexer
:
mocker_args
[
"zmq_replay_ports"
]
=
","
.
join
(
str
(
p
)
for
p
in
replay_bases
)
mocker_args
[
"zmq_replay_ports"
]
=
","
.
join
(
str
(
p
)
for
p
in
replay_bases
)
logger
.
info
(
f
"Allocated ZMQ replay ports
{
self
.
_zmq_replay_ports
}
"
f
"(bases:
{
replay_bases
}
) for
{
num_mockers
}
workers"
)
if
standalone_indexer
:
# Allocate ports for standalone indexer A and B (P2P recovery peer)
indexer_ports
=
allocate_ports
(
2
,
BASE_PORT
)
self
.
_standalone_indexer_port
=
indexer_ports
[
0
]
self
.
_standalone_indexer_b_port
=
indexer_ports
[
1
]
request
.
addfinalizer
(
lambda
:
deallocate_ports
(
indexer_ports
))
# Don't build a single mocker command — we'll launch per-mocker in launch_mockers_with_indexer
self
.
_process
=
None
else
:
command
=
_build_mocker_command
(
endpoint
=
self
.
endpoint
,
store_backend
=
store_backend
,
num_workers
=
num_mockers
,
mocker_args
=
mocker_args
,
)
env
=
os
.
environ
.
copy
()
env
[
"DYN_REQUEST_PLANE"
]
=
request_plane
self
.
_process
=
ManagedProcess
(
command
=
command
,
env
=
env
,
timeout
=
60
,
display_output
=
True
,
health_check_ports
=
[],
health_check_urls
=
[],
log_dir
=
request
.
node
.
name
,
terminate_all_matching_process_names
=
False
,
)
logger
.
info
(
f
"Created mocker process with
{
num_mockers
}
worker(s), endpoint:
{
self
.
endpoint
}
"
f
"
{
', standalone_indexer=True'
if
standalone_indexer
else
''
}
"
command
=
_build_mocker_command
(
endpoint
=
self
.
endpoint
,
store_backend
=
store_backend
,
num_workers
=
num_mockers
,
mocker_args
=
mocker_args
,
)
@
property
def
standalone_indexer_url
(
self
)
->
Optional
[
str
]:
if
self
.
_standalone_indexer_port
is
not
None
:
return
f
"http://localhost:
{
self
.
_standalone_indexer_port
}
"
return
None
@
property
def
standalone_indexer_b_url
(
self
)
->
Optional
[
str
]:
if
self
.
_standalone_indexer_b_port
is
not
None
:
return
f
"http://localhost:
{
self
.
_standalone_indexer_b_port
}
"
return
None
def
__enter__
(
self
):
if
self
.
_standalone_indexer
:
# Launch the standalone indexer binary
block_size
=
self
.
_mocker_args_orig
.
get
(
"block_size"
,
BLOCK_SIZE
)
indexer_cmd
=
[
"cargo"
,
"run"
,
"-p"
,
"dynamo-kv-router"
,
"--features"
,
"indexer-bin,test-endpoints"
,
"--bin"
,
"dynamo-kv-indexer"
,
"--"
,
"--block-size"
,
str
(
block_size
),
"--port"
,
str
(
self
.
_standalone_indexer_port
),
]
self
.
_indexer_process
=
ManagedProcess
(
command
=
indexer_cmd
,
timeout
=
120
,
display_output
=
True
,
health_check_ports
=
[
self
.
_standalone_indexer_port
],
health_check_urls
=
[],
log_dir
=
self
.
_request
.
node
.
name
,
terminate_all_matching_process_names
=
False
,
display_name
=
"dynamo-kv-indexer"
,
)
logger
.
info
(
f
"Starting standalone indexer on port
{
self
.
_standalone_indexer_port
}
"
)
self
.
_indexer_process
.
__enter__
()
# Don't start mocker processes yet — launch_mockers_with_indexer will do it
else
:
logger
.
info
(
f
"Starting mocker process with
{
self
.
num_workers
}
worker(s)"
)
self
.
_process
.
__enter__
()
return
self
async
def
launch_mockers_with_indexer
(
self
,
endpoint
):
"""Launch mockers one-by-one and register each with the standalone indexer.
For each mocker:
1. Launch a mocker process with --num-workers 1
2. Poll endpoint.client().instance_ids() until a new worker_id appears
3. POST /workers to the indexer with the worker_id and its ZMQ addresses
Args:
endpoint: The dynamo endpoint object to discover worker IDs.
"""
client
=
await
endpoint
.
client
()
known_ids
:
set
[
int
]
=
set
()
dp_size
=
self
.
_mocker_args_orig
.
get
(
"dp_size"
,
1
)
for
i
in
range
(
self
.
num_workers
):
# Build per-mocker args with its own ZMQ base port
mocker_args
=
self
.
_mocker_args_orig
.
copy
()
base_port
=
self
.
_zmq_kv_events_ports
[
i
*
dp_size
]
mocker_args
[
"zmq_kv_events_ports"
]
=
str
(
base_port
)
if
self
.
_zmq_replay_ports
:
replay_base
=
self
.
_zmq_replay_ports
[
i
*
dp_size
]
mocker_args
[
"zmq_replay_ports"
]
=
str
(
replay_base
)
command
=
_build_mocker_command
(
endpoint
=
self
.
endpoint
,
store_backend
=
self
.
_store_backend
,
num_workers
=
1
,
mocker_args
=
mocker_args
,
)
env
=
os
.
environ
.
copy
()
env
[
"DYN_REQUEST_PLANE"
]
=
self
.
_request_plane
proc
=
ManagedProcess
(
command
=
command
,
env
=
env
,
timeout
=
60
,
display_output
=
True
,
health_check_ports
=
[],
health_check_urls
=
[],
log_dir
=
self
.
_request
.
node
.
name
,
terminate_all_matching_process_names
=
False
,
display_name
=
f
"mocker-
{
i
}
"
,
)
proc
.
__enter__
()
self
.
_mocker_processes
.
append
(
proc
)
# Poll for the new worker_id
new_worker_id
=
None
for
_
in
range
(
120
):
ids
=
set
(
client
.
instance_ids
())
new
=
ids
-
known_ids
if
new
:
new_worker_id
=
new
.
pop
()
known_ids
.
add
(
new_worker_id
)
break
await
asyncio
.
sleep
(
0.5
)
if
new_worker_id
is
None
:
raise
RuntimeError
(
f
"Timed out waiting for mocker
{
i
}
to register "
f
"(known_ids=
{
known_ids
}
)"
)
# Register each dp_rank endpoint with the standalone indexer
zmq_addresses
=
{}
register_url
=
f
"
{
self
.
standalone_indexer_url
}
/register"
async
with
aiohttp
.
ClientSession
()
as
session
:
for
dp_rank
in
range
(
dp_size
):
port
=
self
.
_zmq_kv_events_ports
[
i
*
dp_size
+
dp_rank
]
endpoint
=
f
"tcp://127.0.0.1:
{
port
}
"
zmq_addresses
[
str
(
dp_rank
)]
=
endpoint
payload
=
{
"instance_id"
:
new_worker_id
,
"endpoint"
:
endpoint
,
"dp_rank"
:
dp_rank
,
"model_name"
:
self
.
model_name
,
"block_size"
:
self
.
_mocker_args_orig
.
get
(
"block_size"
,
BLOCK_SIZE
),
}
if
self
.
_zmq_replay_ports
:
replay_port
=
self
.
_zmq_replay_ports
[
i
*
dp_size
+
dp_rank
]
payload
[
"replay_endpoint"
]
=
f
"tcp://127.0.0.1:
{
replay_port
}
"
async
with
session
.
post
(
register_url
,
json
=
payload
)
as
resp
:
if
resp
.
status
!=
201
:
body
=
await
resp
.
text
()
raise
RuntimeError
(
f
"Failed to register instance
{
new_worker_id
}
"
f
"dp_rank
{
dp_rank
}
:
{
resp
.
status
}
{
body
}
"
)
self
.
worker_id_to_zmq_ports
[
new_worker_id
]
=
zmq_addresses
logger
.
info
(
f
"Mocker
{
i
}
: worker_id=
{
new_worker_id
}
, "
f
"zmq_addresses=
{
zmq_addresses
}
"
)
logger
.
info
(
f
"All
{
self
.
num_workers
}
mockers launched and registered with indexer"
)
env
=
os
.
environ
.
copy
()
env
[
"DYN_REQUEST_PLANE"
]
=
request_plane
def
launch_indexer
(
self
):
"""Launch a second standalone indexer (Indexer B) with --peers pointing to Indexer A.
Workers are passed via --workers so ZMQ sockets connect before recovery
runs, ensuring the subscription handshake completes during the recovery
delay and no events are lost to the ZMQ slow-joiner problem.
"""
if
not
self
.
_standalone_indexer
or
self
.
_standalone_indexer_b_port
is
None
:
raise
RuntimeError
(
"launch_indexer requires standalone_indexer=True"
)
if
not
self
.
worker_id_to_zmq_ports
:
raise
RuntimeError
(
"launch_indexer requires workers to be registered first"
)
block_size
=
self
.
_mocker_args_orig
.
get
(
"block_size"
,
BLOCK_SIZE
)
# Build --workers arg: "worker_id:dp_rank=zmq_addr,..."
worker_entries
=
[]
for
worker_id
,
zmq_addresses
in
self
.
worker_id_to_zmq_ports
.
items
():
for
dp_rank_str
,
zmq_endpoint
in
zmq_addresses
.
items
():
worker_entries
.
append
(
f
"
{
worker_id
}
:
{
dp_rank_str
}
=
{
zmq_endpoint
}
"
)
workers_arg
=
","
.
join
(
worker_entries
)
indexer_b_cmd
=
[
"cargo"
,
"run"
,
"-p"
,
"dynamo-kv-router"
,
"--features"
,
"indexer-bin,test-endpoints"
,
"--bin"
,
"dynamo-kv-indexer"
,
"--"
,
"--block-size"
,
str
(
block_size
),
"--port"
,
str
(
self
.
_standalone_indexer_b_port
),
"--peers"
,
f
"http://localhost:
{
self
.
_standalone_indexer_port
}
"
,
"--workers"
,
workers_arg
,
"--model-name"
,
self
.
model_name
,
]
self
.
_indexer_b_process
=
ManagedProcess
(
command
=
indexer_b_cmd
,
timeout
=
120
,
self
.
_process
=
ManagedProcess
(
command
=
command
,
env
=
env
,
timeout
=
60
,
display_output
=
True
,
health_check_ports
=
[
self
.
_standalone_indexer_b_port
],
health_check_ports
=
[],
health_check_urls
=
[],
log_dir
=
self
.
_
request
.
node
.
name
,
log_dir
=
request
.
node
.
name
,
terminate_all_matching_process_names
=
False
,
display_name
=
"dynamo-kv-indexer-b"
,
)
logger
.
info
(
f
"Starting standalone indexer B on port
{
self
.
_standalone_indexer_b_port
}
"
f
"with peer http://localhost:
{
self
.
_standalone_indexer_port
}
"
f
"Created mocker process with
{
num_mockers
}
worker(s), endpoint:
{
self
.
endpoint
}
"
)
self
.
_indexer_b_process
.
__enter__
()
def
__enter__
(
self
):
logger
.
info
(
f
"Starting mocker process with
{
self
.
num_workers
}
worker(s)"
)
self
.
_process
.
__enter__
()
return
self
def
__exit__
(
self
,
exc_type
,
exc_val
,
exc_tb
):
logger
.
info
(
"Stopping mocker process(es)"
)
# Stop individual mocker processes (standalone_indexer mode)
for
proc
in
self
.
_mocker_processes
:
try
:
proc
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
except
Exception
as
e
:
logger
.
warning
(
f
"Error stopping mocker process:
{
e
}
"
)
self
.
_mocker_processes
.
clear
()
# Stop standalone indexer B (P2P recovery peer)
if
self
.
_indexer_b_process
is
not
None
:
try
:
self
.
_indexer_b_process
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
except
Exception
as
e
:
logger
.
warning
(
f
"Error stopping indexer B process:
{
e
}
"
)
self
.
_indexer_b_process
=
None
# Stop standalone indexer A
if
self
.
_indexer_process
is
not
None
:
try
:
self
.
_indexer_process
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
except
Exception
as
e
:
logger
.
warning
(
f
"Error stopping indexer process:
{
e
}
"
)
self
.
_indexer_process
=
None
# Stop single mocker process (non-standalone mode)
logger
.
info
(
"Stopping mocker process"
)
if
self
.
_process
is
not
None
:
self
.
_process
.
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
if
self
.
_zmq_kv_events_ports
:
...
...
@@ -844,92 +586,6 @@ def test_kv_router_bindings(
)
@
pytest
.
mark
.
parametrize
(
"store_backend,durable_kv_events,request_plane"
,
[
(
"etcd"
,
True
,
"nats"
),
# JetStream mode - uses JetStream
(
"etcd"
,
False
,
"tcp"
),
# NATS core mode (with gap detection) - no JetStream
(
"file"
,
True
,
"nats"
),
# File backend - uses JetStream
],
ids
=
[
"jetstream"
,
"nats_core"
,
"file"
,
],
indirect
=
[
"request_plane"
,
"durable_kv_events"
],
)
@
pytest
.
mark
.
timeout
(
300
)
def
test_indexers_sync
(
request
,
runtime_services_dynamic_ports
,
predownload_tokenizers
,
file_storage_backend
,
store_backend
,
durable_kv_events
,
request_plane
,
):
"""
Test that two KV routers have synchronized indexer states after processing requests.
This test verifies that both routers converge to the same internal state.
Tests with three configurations:
- jetstream: etcd backend, JetStream for KV events, NATS request plane
- nats_core: etcd backend, NATS Core with gap detection, TCP request plane
- file: file backend, JetStream for KV events, NATS request plane
"""
logger
.
info
(
f
"Starting indexers sync test: store_backend=
{
store_backend
}
, "
f
"durable_kv_events=
{
durable_kv_events
}
, request_plane=
{
request_plane
}
"
)
# Use the dynamic-port fixture to avoid hardcoded localhost:4222/2379 in parallel runs.
nats_process
,
_etcd_process
=
runtime_services_dynamic_ports
# Create mocker args dictionary
# Use 2 DP ranks to test per-dp_rank event ID tracking and recovery
mocker_args
=
{
"speedup_ratio"
:
SPEEDUP_RATIO
,
"block_size"
:
BLOCK_SIZE
,
"durable_kv_events"
:
durable_kv_events
,
"dp_size"
:
2
,
}
with
MockerProcess
(
request
,
mocker_args
=
mocker_args
,
num_mockers
=
NUM_MOCKERS
,
store_backend
=
store_backend
,
request_plane
=
request_plane
,
zmq_kv_events
=
True
,
zmq_replay
=
True
,
standalone_indexer
=
True
,
model_name
=
MODEL_NAME
,
)
as
mockers
:
# Start mocker instances (2 workers x 2 DP ranks = 4 independent event streams)
logger
.
info
(
f
"Starting
{
NUM_MOCKERS
}
mocker instances with dp_size=2"
)
logger
.
info
(
f
"All mockers using endpoint:
{
mockers
.
endpoint
}
"
)
# Use the common test implementation (creates its own runtimes for each router)
# Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
# When using durable_kv_events=True, use JetStream mode for the router
_test_router_indexers_sync
(
engine_workers
=
mockers
,
block_size
=
BLOCK_SIZE
,
model_name
=
MODEL_NAME
,
num_workers
=
NUM_MOCKERS
,
store_backend
=
store_backend
,
request_plane
=
request_plane
,
test_nats_interruption
=
not
durable_kv_events
,
nats_server
=
nats_process
if
not
durable_kv_events
else
None
,
durable_kv_events
=
durable_kv_events
,
standalone_indexer_url
=
mockers
.
standalone_indexer_url
,
standalone_indexer_b_url
=
mockers
.
standalone_indexer_b_url
,
test_zmq_replay
=
True
,
)
logger
.
info
(
"Indexers sync test completed successfully"
)
@
pytest
.
mark
.
timeout
(
120
)
# bumped for xdist contention (was 42s; ~13.80s serial avg)
@
pytest
.
mark
.
parametrize
(
"durable_kv_events"
,
[
False
],
ids
=
[
"nondurable"
],
indirect
=
True
...
...
@@ -974,9 +630,8 @@ def test_query_instance_id_returns_worker_and_tokens(
(
True
,
True
,
False
),
# JetStream mode with KV events
(
False
,
True
,
False
),
# NATS Core mode with local indexer (default)
(
False
,
False
,
False
),
# Approximate mode (--no-kv-events) - no KV events
(
False
,
True
,
True
),
# ZMQ mode: mocker → ZMQ PUB → relay → NATS
],
ids
=
[
"jetstream"
,
"nats_core"
,
"no_kv_events"
,
"zmq"
],
ids
=
[
"jetstream"
,
"nats_core"
,
"no_kv_events"
],
indirect
=
[
"durable_kv_events"
],
)
def
test_router_decisions
(
...
...
@@ -1016,7 +671,6 @@ def test_router_decisions(
num_mockers
=
2
,
request_plane
=
request_plane
,
zmq_kv_events
=
zmq_kv_events
,
standalone_indexer
=
zmq_kv_events
,
model_name
=
MODEL_NAME
,
)
as
mockers
:
logger
.
info
(
f
"All mockers using endpoint:
{
mockers
.
endpoint
}
"
)
...
...
@@ -1035,7 +689,6 @@ def test_router_decisions(
test_dp_rank
=
True
,
use_kv_events
=
use_kv_events
,
durable_kv_events
=
durable_kv_events
,
standalone_indexer_url
=
mockers
.
standalone_indexer_url
,
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment