"vscode:/vscode.git/clone" did not exist on "100819299f6b7fa55b8b56e368ddc9f863a9fb48"
Unverified Commit 69797b5a authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: Only monitor NATS metrics if using NATS request plane (#4442)


Signed-off-by: default avatarGraham King <grahamk@nvidia.com>
parent a8e5328e
...@@ -40,7 +40,6 @@ async def init_planner(runtime: DistributedRuntime, args): ...@@ -40,7 +40,6 @@ async def init_planner(runtime: DistributedRuntime, args):
await start_sla_planner(runtime, args) await start_sla_planner(runtime, args)
component = runtime.namespace(args.namespace).component("Planner") component = runtime.namespace(args.namespace).component("Planner")
await component.create_service()
async def generate(request: RequestType): async def generate(request: RequestType):
"""Dummy endpoint to satisfy that each component has an endpoint""" """Dummy endpoint to satisfy that each component has an endpoint"""
......
...@@ -260,7 +260,6 @@ async def worker(runtime: DistributedRuntime): ...@@ -260,7 +260,6 @@ async def worker(runtime: DistributedRuntime):
# Create service component - use "router" as component name # Create service component - use "router" as component name
component = runtime.namespace(namespace).component("router") component = runtime.namespace(namespace).component("router")
await component.create_service()
# Create handler # Create handler
handler = StandaloneRouterHandler( handler = StandaloneRouterHandler(
......
...@@ -110,7 +110,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -110,7 +110,6 @@ async def init(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -197,7 +196,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config): ...@@ -197,7 +196,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -257,7 +255,6 @@ async def init_embedding(runtime: DistributedRuntime, config: Config): ...@@ -257,7 +255,6 @@ async def init_embedding(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -315,7 +312,6 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config) ...@@ -315,7 +312,6 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -364,7 +360,6 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con ...@@ -364,7 +360,6 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -405,7 +400,6 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config): ...@@ -405,7 +400,6 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
...@@ -447,7 +441,6 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co ...@@ -447,7 +441,6 @@ async def init_multimodal_prefill_worker(runtime: DistributedRuntime, config: Co
component = runtime.namespace(dynamo_args.namespace).component( component = runtime.namespace(dynamo_args.namespace).component(
dynamo_args.component dynamo_args.component
) )
await component.create_service()
generate_endpoint = component.endpoint(dynamo_args.endpoint) generate_endpoint = component.endpoint(dynamo_args.endpoint)
......
...@@ -143,7 +143,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -143,7 +143,6 @@ async def init(runtime: DistributedRuntime, config: Config):
) )
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
# Convert model path to Path object if it's a local path, otherwise keep as string # Convert model path to Path object if it's a local path, otherwise keep as string
model_path = str(config.model_path) model_path = str(config.model_path)
......
...@@ -326,7 +326,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config): ...@@ -326,7 +326,6 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
Instantiate and serve Instantiate and serve
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
clear_endpoint = component.endpoint("clear_kv_blocks") clear_endpoint = component.endpoint("clear_kv_blocks")
...@@ -427,7 +426,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -427,7 +426,6 @@ async def init(runtime: DistributedRuntime, config: Config):
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
clear_endpoint = component.endpoint("clear_kv_blocks") clear_endpoint = component.endpoint("clear_kv_blocks")
...@@ -558,7 +556,6 @@ def get_engine_cache_info(engine: AsyncLLM): ...@@ -558,7 +556,6 @@ def get_engine_cache_info(engine: AsyncLLM):
async def init_multimodal_processor(runtime: DistributedRuntime, config: Config): async def init_multimodal_processor(runtime: DistributedRuntime, config: Config):
"""Initialize multimodal processor component""" """Initialize multimodal processor component"""
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
...@@ -610,7 +607,6 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config) ...@@ -610,7 +607,6 @@ async def init_multimodal_processor(runtime: DistributedRuntime, config: Config)
async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Config): async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Config):
"""Initialize multimodal encode worker component""" """Initialize multimodal encode worker component"""
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
...@@ -657,7 +653,6 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config): ...@@ -657,7 +653,6 @@ async def init_multimodal_worker(runtime: DistributedRuntime, config: Config):
Both can operate in aggregated (P+D) or disaggregated (P→D) mode. Both can operate in aggregated (P+D) or disaggregated (P→D) mode.
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
clear_endpoint = component.endpoint("clear_kv_blocks") clear_endpoint = component.endpoint("clear_kv_blocks")
......
...@@ -53,7 +53,7 @@ The hierarchy and naming in etcd and NATS may change over time, and this documen ...@@ -53,7 +53,7 @@ The hierarchy and naming in etcd and NATS may change over time, and this documen
For etcd, it also creates a primary lease and spins up a background task to keep the lease alive. All objects registered under this `DistributedRuntime` use this lease_id to maintain their life cycle. There is also a cancellation token that is tied to the primary lease. When the cancellation token is triggered or the background task fails, the primary lease is revoked or expires, and the kv pairs stored with this lease_id are removed. For etcd, it also creates a primary lease and spins up a background task to keep the lease alive. All objects registered under this `DistributedRuntime` use this lease_id to maintain their life cycle. There is also a cancellation token that is tied to the primary lease. When the cancellation token is triggered or the background task fails, the primary lease is revoked or expires, and the kv pairs stored with this lease_id are removed.
- `Namespace`: `Namespace`s are primarily a logical grouping mechanism and are not registered in etcd. A `Namespace` provides the root path for all components under it. - `Namespace`: `Namespace`s are primarily a logical grouping mechanism and are not registered in etcd. A `Namespace` provides the root path for all components under it.
- `Component`: When a `Component` object is created, similar to `Namespace`, it isn't registered in etcd. When `create_service` is called, it creates a NATS service group using `{namespace_name}.{service_name}` and registers a service in the registry of the `Component`, where the registry is an internal data structure that tracks all services and endpoints within the `DistributedRuntime`. - `Component`: When a `Component` object is created, similar to `Namespace`, it isn't registered in etcd. When `create_service` is called, it creates a NATS service group using `{namespace_name}.{service_name}` for metrics and registers a service in the registry of the `Component`, where the registry is an internal data structure that tracks all services and endpoints within the `DistributedRuntime`.
- `Endpoint`: When an Endpoint object is created and started, it performs two key registrations: - `Endpoint`: When an Endpoint object is created and started, it performs two key registrations:
- NATS Registration: The endpoint is registered with the NATS service group created during service creation. The endpoint is assigned a unique subject following the naming: `{namespace_name}.{service_name}.{endpoint_name}-{lease_id_hex}`. - NATS Registration: The endpoint is registered with the NATS service group created during service creation. The endpoint is assigned a unique subject following the naming: `{namespace_name}.{service_name}.{endpoint_name}-{lease_id_hex}`.
- etcd Registration: The endpoint information is stored in etcd at a path following the naming: `/services/{namespace}/{component}/{endpoint}-{lease_id}`. Note that the endpoints of different workers of the same type (e.g., two `VllmPrefillWorker`s in one deployment) share the same `Namespace`, `Component`, and `Endpoint` names. They are distinguished by the primary `lease_id` of their respective `DistributedRuntime`s. - etcd Registration: The endpoint information is stored in etcd at a path following the naming: `/services/{namespace}/{component}/{endpoint}-{lease_id}`. Note that the endpoints of different workers of the same type (e.g., two `VllmPrefillWorker`s in one deployment) share the same `Namespace`, `Component`, and `Endpoint` names. They are distinguished by the primary `lease_id` of their respective `DistributedRuntime`s.
......
...@@ -27,7 +27,6 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker ...@@ -27,7 +27,6 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 2. Register ourselves on the network # 2. Register ourselves on the network
# #
component = runtime.namespace("namespace").component("component") component = runtime.namespace("namespace").component("component")
await component.create_service()
model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B" model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
......
...@@ -339,7 +339,6 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker ...@@ -339,7 +339,6 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 2. Register ourselves on the network # 2. Register ourselves on the network
# #
component = runtime.namespace("namespace").component("component") component = runtime.namespace("namespace").component("component")
await component.create_service()
model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B" model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
......
...@@ -58,7 +58,6 @@ async def main(): ...@@ -58,7 +58,6 @@ async def main():
# Create middle server component # Create middle server component
component = runtime.namespace("demo").component("middle") component = runtime.namespace("demo").component("middle")
await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
......
...@@ -35,7 +35,6 @@ async def main(): ...@@ -35,7 +35,6 @@ async def main():
# Create server component # Create server component
component = runtime.namespace("demo").component("server") component = runtime.namespace("demo").component("server")
await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
handler = DemoServer() handler = DemoServer()
......
...@@ -28,7 +28,6 @@ async def worker(runtime: DistributedRuntime): ...@@ -28,7 +28,6 @@ async def worker(runtime: DistributedRuntime):
endpoint_name = "generate" endpoint_name = "generate"
component = runtime.namespace(namespace_name).component(component_name) component = runtime.namespace(namespace_name).component(component_name)
await component.create_service()
logger.info(f"Created service {namespace_name}/{component_name}") logger.info(f"Created service {namespace_name}/{component_name}")
......
...@@ -93,7 +93,6 @@ async def worker(runtime: DistributedRuntime): ...@@ -93,7 +93,6 @@ async def worker(runtime: DistributedRuntime):
namespace, comp_name, endpoint_name = parts namespace, comp_name, endpoint_name = parts
component = runtime.namespace(namespace).component(comp_name) component = runtime.namespace(namespace).component(comp_name)
await component.create_service()
stats_endpoint = component.endpoint(endpoint_name) stats_endpoint = component.endpoint(endpoint_name)
print( print(
......
...@@ -269,7 +269,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co ...@@ -269,7 +269,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
......
...@@ -226,7 +226,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co ...@@ -226,7 +226,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
......
...@@ -303,7 +303,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co ...@@ -303,7 +303,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
......
...@@ -273,7 +273,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co ...@@ -273,7 +273,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
......
...@@ -437,7 +437,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co ...@@ -437,7 +437,6 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
""" """
component = runtime.namespace(config.namespace).component(config.component) component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
generate_endpoint = component.endpoint(config.endpoint) generate_endpoint = component.endpoint(config.endpoint)
clear_endpoint = component.endpoint("clear_kv_blocks") clear_endpoint = component.endpoint("clear_kv_blocks")
......
...@@ -81,7 +81,12 @@ pub async fn run( ...@@ -81,7 +81,12 @@ pub async fn run(
let request_plane: RequestPlaneMode = flags.request_plane.parse()?; let request_plane: RequestPlaneMode = flags.request_plane.parse()?;
let dst_config = DistributedConfig { let dst_config = DistributedConfig {
store_backend: selected_store, store_backend: selected_store,
nats_config: nats::ClientOptions::default(), // We only need NATS here to monitor its metrics, so only if it's our request plane.
nats_config: if request_plane.is_nats() {
Some(nats::ClientOptions::default())
} else {
None
},
request_plane, request_plane,
}; };
let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?; let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
......
...@@ -32,7 +32,6 @@ class RequestHandler: ...@@ -32,7 +32,6 @@ class RequestHandler:
@dynamo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("bar") component = runtime.namespace("examples/bls").component("bar")
await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate) await endpoint.serve_endpoint(RequestHandler().generate)
......
...@@ -31,7 +31,6 @@ class RequestHandler: ...@@ -31,7 +31,6 @@ class RequestHandler:
@dynamo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("foo") component = runtime.namespace("examples/bls").component("foo")
await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate) await endpoint.serve_endpoint(RequestHandler().generate)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment