Unverified Commit 42cde264 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

fix: migrate example and document to the latest endpoint API (#6542)

parent 222c2e85
...@@ -58,9 +58,7 @@ from dynamollm import DistributedRuntime, KvRouter, KvRouterConfig ...@@ -58,9 +58,7 @@ from dynamollm import DistributedRuntime, KvRouter, KvRouterConfig
async def main(): async def main():
# Get runtime and create endpoint # Get runtime and create endpoint
runtime = DistributedRuntime.detached() runtime = DistributedRuntime.detached()
namespace = runtime.namespace("dynamo") endpoint = runtime.endpoint("dynamo.backend.generate")
component = namespace.component("backend")
endpoint = component.endpoint("generate")
# Create KV router # Create KV router
kv_router_config = KvRouterConfig() kv_router_config = KvRouterConfig()
...@@ -227,9 +225,7 @@ from dynamo.llm import DistributedRuntime, KvRouter, KvRouterConfig ...@@ -227,9 +225,7 @@ from dynamo.llm import DistributedRuntime, KvRouter, KvRouterConfig
async def minimize_ttft_routing(): async def minimize_ttft_routing():
# Setup router # Setup router
runtime = DistributedRuntime.detached() runtime = DistributedRuntime.detached()
namespace = runtime.namespace("dynamo") endpoint = runtime.endpoint("dynamo.backend.generate")
component = namespace.component("backend")
endpoint = component.endpoint("generate")
router = KvRouter( router = KvRouter(
endpoint=endpoint, endpoint=endpoint,
......
...@@ -211,7 +211,7 @@ Dynamo supports several routing strategies when sending requests from one compon ...@@ -211,7 +211,7 @@ Dynamo supports several routing strategies when sending requests from one compon
First, we must create a client tied to a components endpoint, we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component. First, we must create a client tied to a components endpoint, we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component.
```python ```python
client = namespace('dynamo').component('VllmWorker').endpoint('generate').client() client = runtime.endpoint("dynamo.VllmWorker.generate").client()
``` ```
We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component. We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component.
...@@ -292,7 +292,7 @@ When both workers are registered, requests are automatically routed. ...@@ -292,7 +292,7 @@ When both workers are registered, requests are automatically routed.
```python ```python
# Decode worker registration (in your decode worker) # Decode worker registration (in your decode worker)
decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate") decode_endpoint = runtime.endpoint("dynamo.decode.generate")
await register_model( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
...@@ -305,7 +305,7 @@ await register_model( ...@@ -305,7 +305,7 @@ await register_model(
await decode_endpoint.serve_endpoint(decode_handler.generate) await decode_endpoint.serve_endpoint(decode_handler.generate)
# Prefill worker registration (in your prefill worker) # Prefill worker registration (in your prefill worker)
prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate") prefill_endpoint = runtime.endpoint("dynamo.prefill.generate")
await register_model( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
......
...@@ -28,7 +28,7 @@ Since these components are deployed in different processes, each has its own `Di ...@@ -28,7 +28,7 @@ Since these components are deployed in different processes, each has its own `Di
- Worker components register with names like `backend`, `prefill`, `decode`, or `encoder` depending on their role - Worker components register with names like `backend`, `prefill`, `decode`, or `encoder` depending on their role
- Workers register endpoints like `generate`, `clear_kv_blocks`, or `load_metrics` - Workers register endpoints like `generate`, `clear_kv_blocks`, or `load_metrics`
Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, their `Component`s are created programmatically (e.g., `runtime.namespace("dynamo").component("backend")`), and their `Endpoint`s are created using the `component.endpoint()` method. Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, and their `Endpoint`s are obtained by path. In Python, use `runtime.endpoint("namespace.component.endpoint")` (e.g., `runtime.endpoint("dynamo.backend.generate")`).
## Initialization ## Initialization
......
...@@ -27,11 +27,10 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker ...@@ -27,11 +27,10 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 2. Register ourselves on the network # 2. Register ourselves on the network
# #
component = runtime.namespace("namespace").component("component") endpoint = runtime.endpoint("namespace.component.endpoint")
model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B" model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
endpoint = component.endpoint("endpoint")
# Optional last param to register_model is model_name. If not present derives it from model_path # Optional last param to register_model is model_name. If not present derives it from model_path
await register_model(model_input, model_type, endpoint, model_path) await register_model(model_input, model_type, endpoint, model_path)
......
...@@ -133,10 +133,8 @@ async def worker(runtime: DistributedRuntime) -> None: ...@@ -133,10 +133,8 @@ async def worker(runtime: DistributedRuntime) -> None:
) )
# Connect to downstream TRT-LLM workers # Connect to downstream TRT-LLM workers
downstream_endpoint = ( downstream_endpoint = runtime.endpoint(
runtime.namespace(args.namespace) f"{args.namespace}.{args.downstream_component}.{args.downstream_endpoint}"
.component(args.downstream_component)
.endpoint(args.downstream_endpoint)
) )
downstream_client = await downstream_endpoint.client() downstream_client = await downstream_endpoint.client()
...@@ -175,8 +173,7 @@ async def worker(runtime: DistributedRuntime) -> None: ...@@ -175,8 +173,7 @@ async def worker(runtime: DistributedRuntime) -> None:
) )
# Register this worker's endpoint # Register this worker's endpoint
component = runtime.namespace(args.namespace).component(args.component) endpoint = runtime.endpoint(f"{args.namespace}.{args.component}.{args.endpoint}")
endpoint = component.endpoint(args.endpoint)
# Use ModelInput.Tokens so Frontend preprocesses the request # Use ModelInput.Tokens so Frontend preprocesses the request
# Request format: {token_ids, sampling_options, stop_conditions, extra_args: {messages}} # Request format: {token_ids, sampling_options, stop_conditions, extra_args: {messages}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment