Unverified Commit 42cde264 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

fix: migrate example and document to the latest endpoint API (#6542)

parent 222c2e85
......@@ -58,9 +58,7 @@ from dynamollm import DistributedRuntime, KvRouter, KvRouterConfig
async def main():
# Get runtime and create endpoint
runtime = DistributedRuntime.detached()
namespace = runtime.namespace("dynamo")
component = namespace.component("backend")
endpoint = component.endpoint("generate")
endpoint = runtime.endpoint("dynamo.backend.generate")
# Create KV router
kv_router_config = KvRouterConfig()
......@@ -227,9 +225,7 @@ from dynamo.llm import DistributedRuntime, KvRouter, KvRouterConfig
async def minimize_ttft_routing():
# Setup router
runtime = DistributedRuntime.detached()
namespace = runtime.namespace("dynamo")
component = namespace.component("backend")
endpoint = component.endpoint("generate")
endpoint = runtime.endpoint("dynamo.backend.generate")
router = KvRouter(
endpoint=endpoint,
......
......@@ -211,7 +211,7 @@ Dynamo supports several routing strategies when sending requests from one compon
First, we must create a client tied to a components endpoint, we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component.
```python
client = namespace('dynamo').component('VllmWorker').endpoint('generate').client()
client = runtime.endpoint("dynamo.VllmWorker.generate").client()
```
We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component.
......@@ -292,7 +292,7 @@ When both workers are registered, requests are automatically routed.
```python
# Decode worker registration (in your decode worker)
decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate")
decode_endpoint = runtime.endpoint("dynamo.decode.generate")
await register_model(
model_input=ModelInput.Tokens,
......@@ -305,7 +305,7 @@ await register_model(
await decode_endpoint.serve_endpoint(decode_handler.generate)
# Prefill worker registration (in your prefill worker)
prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate")
prefill_endpoint = runtime.endpoint("dynamo.prefill.generate")
await register_model(
model_input=ModelInput.Tokens,
......
......@@ -28,7 +28,7 @@ Since these components are deployed in different processes, each has its own `Di
- Worker components register with names like `backend`, `prefill`, `decode`, or `encoder` depending on their role
- Workers register endpoints like `generate`, `clear_kv_blocks`, or `load_metrics`
Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, their `Component`s are created programmatically (e.g., `runtime.namespace("dynamo").component("backend")`), and their `Endpoint`s are created using the `component.endpoint()` method.
Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, and their `Endpoint`s are obtained by path. In Python, use `runtime.endpoint("namespace.component.endpoint")` (e.g., `runtime.endpoint("dynamo.backend.generate")`).
## Initialization
......
......@@ -27,11 +27,10 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
# 2. Register ourselves on the network
#
component = runtime.namespace("namespace").component("component")
endpoint = runtime.endpoint("namespace.component.endpoint")
model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
endpoint = component.endpoint("endpoint")
# Optional last param to register_model is model_name. If not present derives it from model_path
await register_model(model_input, model_type, endpoint, model_path)
......
......@@ -133,10 +133,8 @@ async def worker(runtime: DistributedRuntime) -> None:
)
# Connect to downstream TRT-LLM workers
downstream_endpoint = (
runtime.namespace(args.namespace)
.component(args.downstream_component)
.endpoint(args.downstream_endpoint)
downstream_endpoint = runtime.endpoint(
f"{args.namespace}.{args.downstream_component}.{args.downstream_endpoint}"
)
downstream_client = await downstream_endpoint.client()
......@@ -175,8 +173,7 @@ async def worker(runtime: DistributedRuntime) -> None:
)
# Register this worker's endpoint
component = runtime.namespace(args.namespace).component(args.component)
endpoint = component.endpoint(args.endpoint)
endpoint = runtime.endpoint(f"{args.namespace}.{args.component}.{args.endpoint}")
# Use ModelInput.Tokens so Frontend preprocesses the request
# Request format: {token_ids, sampling_options, stop_conditions, extra_args: {messages}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment