Unverified commit 5624d144, authored by Tzu-Ling Kan, committed by GitHub
Browse files

Rename fetch_llm to fetch_model (#6268)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
parent 09b6ab2f
......@@ -5,7 +5,7 @@
#
# Start a frontend node. This runs:
# - OpenAI HTTP server.
# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
# - Auto-discovery: Watches etcd for engine/worker registration (via `register_model`).
# - Pre-processor: Prompt templating and tokenization.
# - Router, defaulting to round-robin. Use --router-mode to switch (round-robin, random, kv, direct).
#
......
......@@ -32,7 +32,7 @@ from dynamo.llm import (
PythonAsyncEngine,
RouterConfig,
RouterMode,
fetch_llm,
fetch_model,
)
from dynamo.runtime import DistributedRuntime
......@@ -393,7 +393,7 @@ class EngineFactory:
source_path = mdc.source_path()
if not os.path.exists(source_path):
await fetch_llm(source_path, ignore_weights=True)
await fetch_model(source_path, ignore_weights=True)
tokenizer_mode = getattr(self.flags, "tokenizer_mode", None) or "auto"
config_format = getattr(self.flags, "config_format", None) or "auto"
......
......@@ -11,7 +11,7 @@ but internally routes requests to local routers in different namespaces based on
a grid-based pool selection strategy.
Key features:
- Registers as BOTH prefill AND decode worker via register_llm()
- Registers as BOTH prefill AND decode worker via register_model()
- Routes prefill requests based on (ISL, TTFT) to prefill pools
- Routes decode requests based on (context_length, ITL) to decode pools
- Connects to local routers in each pool's namespace
......@@ -24,7 +24,7 @@ import os
import uvloop
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -122,7 +122,7 @@ async def worker(runtime: DistributedRuntime):
logger.info("Registering as prefill worker...")
# Register as prefill worker - frontend will send prefill requests here
# Use model_name as model_path since we don't need tokenizer/model files
await register_llm(
await register_model(
model_input=ModelInput.Tokens,
model_type=ModelType.Prefill,
endpoint=prefill_endpoint,
......@@ -135,7 +135,7 @@ async def worker(runtime: DistributedRuntime):
logger.info("Registering as decode worker...")
# Register as decode worker - frontend will send decode requests here
await register_llm(
await register_model(
model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions,
endpoint=decode_endpoint,
......
......@@ -16,7 +16,7 @@ import uvloop
os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
from dynamo.llm import EngineType, EntrypointArgs, fetch_llm, make_engine, run_input
from dynamo.llm import EngineType, EntrypointArgs, fetch_model, make_engine, run_input
from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -46,7 +46,7 @@ async def prefetch_model(model_path: str) -> None:
logger.info(f"Pre-fetching model from HuggingFace: {model_path}")
try:
local_path = await fetch_llm(model_path, ignore_weights=True)
local_path = await fetch_model(model_path, ignore_weights=True)
logger.info(f"Model cached at: {local_path}")
except Exception as e:
logger.warning(
......
......@@ -21,7 +21,7 @@ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
from dynamo.common.config_dump import register_encoder
from dynamo.common.utils.runtime import parse_endpoint
from dynamo.llm import fetch_llm
from dynamo.llm import fetch_model
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang import __version__
......@@ -507,17 +507,17 @@ async def parse_args(args: list[str]) -> Config:
if not parsed_args.served_model_name:
parsed_args.served_model_name = model_path
# Download the model if necessary using modelexpress.
# We don't set `parsed_args.model_path` to the local path fetch_llm returns
# We don't set `parsed_args.model_path` to the local path fetch_model returns
# because sglang will send this to its pipeline-parallel workers, which may
# not have the local path.
# sglang will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder).
if not os.path.exists(model_path):
await fetch_llm(model_path)
await fetch_model(model_path)
# TODO: sglang downloads the model in `from_cli_args`, which means we had to
# fetch_llm (download the model) here, in `parse_args`. `parse_args` should not
# fetch_model (download the model) here, in `parse_args`. `parse_args` should not
# contain code to download a model, it should only parse the args.
# For diffusion/video workers, create a minimal dummy ServerArgs since diffusion
......
......@@ -32,7 +32,7 @@ from dynamo.sglang.health_check import (
from dynamo.sglang.publisher import DynamoSglangPublisher, setup_sgl_metrics
from dynamo.sglang.register import (
register_image_diffusion_model,
register_llm_with_readiness_gate,
register_model_with_readiness_gate,
register_video_generation_model,
)
from dynamo.sglang.request_handlers import (
......@@ -307,7 +307,7 @@ async def init(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
register_llm_with_readiness_gate(
register_model_with_readiness_gate(
engine,
generate_endpoint,
server_args,
......@@ -385,7 +385,7 @@ async def init_prefill(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
register_llm_with_readiness_gate(
register_model_with_readiness_gate(
engine,
generate_endpoint,
server_args,
......@@ -474,7 +474,7 @@ async def init_diffusion(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
register_llm_with_readiness_gate(
register_model_with_readiness_gate(
engine,
generate_endpoint,
server_args,
......@@ -538,7 +538,7 @@ async def init_embedding(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
register_llm_with_readiness_gate(
register_model_with_readiness_gate(
engine,
generate_endpoint,
server_args,
......@@ -766,7 +766,7 @@ async def init_multimodal_processor(
(prometheus_names.labels.MODEL_NAME, server_args.served_model_name),
],
),
register_llm_with_readiness_gate(
register_model_with_readiness_gate(
None, # engine
generate_endpoint,
server_args,
......
......@@ -11,11 +11,11 @@ from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import get_local_ip_auto
from dynamo._core import Endpoint
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.sglang.args import DynamoArgs
async def _register_llm_with_runtime_config(
async def _register_model_with_runtime_config(
engine: sgl.Engine,
endpoint: Endpoint,
server_args: ServerArgs,
......@@ -49,7 +49,7 @@ async def _register_llm_with_runtime_config(
output_type = ModelType.Chat
try:
await register_llm(
await register_model(
input_type,
output_type,
endpoint,
......@@ -231,7 +231,7 @@ async def _get_runtime_config(
return runtime_config
async def register_llm_with_readiness_gate(
async def register_model_with_readiness_gate(
engine: sgl.Engine,
generate_endpoint: Endpoint,
server_args: ServerArgs,
......@@ -254,7 +254,7 @@ async def register_llm_with_readiness_gate(
Raises:
RuntimeError: If model registration fails.
"""
registration_success = await _register_llm_with_runtime_config(
registration_success = await _register_model_with_runtime_config(
engine,
generate_endpoint,
server_args,
......@@ -295,7 +295,7 @@ async def register_image_diffusion_model(
model_name = server_args.model_path
try:
await register_llm(
await register_model(
ModelInput.Text,
ModelType.Images,
endpoint,
......@@ -335,7 +335,7 @@ async def register_video_generation_model(
model_name = server_args.model_path
try:
await register_llm(
await register_model(
ModelInput.Text,
ModelType.Videos,
endpoint,
......
......@@ -42,7 +42,7 @@ from dynamo.llm import (
ModelInput,
ModelRuntimeConfig,
ModelType,
register_llm,
register_model,
)
from dynamo.runtime import DistributedRuntime
from dynamo.trtllm.constants import DisaggregationMode
......@@ -437,7 +437,7 @@ async def init_llm_worker(
# Encode workers do NOT register - they're internal workers only
# Prefill and decode workers register - frontend detects their role via ModelType
if config.disaggregation_mode != DisaggregationMode.ENCODE:
await register_llm(
await register_model(
model_input,
model_type,
endpoint,
......
......@@ -10,7 +10,7 @@ workers using diffusion models (Wan, Flux, Cosmos, etc.).
import asyncio
import logging
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime
from dynamo.trtllm.utils.trtllm_utils import Config
......@@ -91,9 +91,8 @@ async def init_video_diffusion_worker(
logging.info(f"Registering model '{model_name}' with ModelType={model_type}")
# register_llm is a misnomer — it's actually Dynamo's generic model
# registration function and the video diffisuion model is not an llm
await register_llm(
# register_model is Dynamo's generic model registration function
await register_model(
ModelInput.Text,
model_type,
endpoint,
......
......@@ -31,8 +31,8 @@ from dynamo.llm import (
ModelInput,
ModelType,
lora_name_to_id,
register_llm,
unregister_llm,
register_model,
unregister_model,
)
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -571,7 +571,7 @@ class BaseWorkerHandler(ABC):
}
# Publish with format: v1/mdc/dynamo/backend/generate/{instance_id}/{lora_slug}
await register_llm(
await register_model(
model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions,
endpoint=self.generate_endpoint,
......@@ -691,7 +691,7 @@ class BaseWorkerHandler(ABC):
f"Unregistering LoRA '{lora_name}' ModelDeploymentCard"
)
try:
await unregister_llm(
await unregister_model(
endpoint=self.generate_endpoint,
lora_name=lora_name,
)
......
......@@ -28,8 +28,8 @@ from dynamo.llm import (
ModelInput,
ModelRuntimeConfig,
ModelType,
fetch_llm,
register_llm,
fetch_model,
register_model,
)
# Optional imports for frontend decoding support
......@@ -113,14 +113,14 @@ async def worker():
# Download the model if necessary using modelexpress.
# We want it on disk before we start vllm to avoid downloading from HuggingFace.
#
# We don't set `config.engine_args.model` to the local path fetch_llm returns
# We don't set `config.engine_args.model` to the local path fetch_model returns
# because vllm will send that name to its Ray pipeline-parallel workers, which
# may not have the local path.
# vllm will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder).
if not os.path.exists(config.model):
await fetch_llm(config.model)
await fetch_model(config.model)
# CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established
......@@ -517,7 +517,7 @@ async def register_vllm_model(
media_fetcher.timeout_ms(30000)
media_fetcher.allow_direct_port(True)
await register_llm(
await register_model(
model_input,
model_type,
generate_endpoint,
......@@ -953,7 +953,7 @@ async def init_multimodal_processor(
await encode_worker_client.wait_for_instances()
# Register the endpoint as entrypoint to a model
await register_llm(
await register_model(
ModelInput.Tokens,
ModelType.Chat,
generate_endpoint,
......@@ -1141,7 +1141,7 @@ async def init_ec_processor(
await pd_client.wait_for_instances()
# Register the endpoint as entrypoint to a model (same as preprocessed_handler)
await register_llm(
await register_model(
ModelInput.Tokens, # Use Rust tokenization for better performance and multi-image support
ModelType.Chat,
generate_endpoint,
......@@ -1324,7 +1324,7 @@ async def init_omni(
return
# TODO: extend for multi-stage pipelines
await register_llm(
await register_model(
ModelInput.Text,
ModelType.Images,
generate_endpoint,
......
......@@ -34,7 +34,7 @@ The Dynamo Frontend is the API gateway for serving LLM inference requests. It pr
python -m dynamo.frontend --http-port 8000
```
This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_llm`.
This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_model`.
### KServe gRPC Frontend
......
......@@ -57,7 +57,7 @@ Tune these values based on your workload. Connection window should accommodate `
## Registering a Backend
Similar to HTTP frontend, the registered backend will be auto-discovered and added to the frontend list of serving model. To register a backend, the same `register_llm()` API will be used. Currently the frontend support serving of the following model type and model input combination:
Similar to the HTTP frontend, a registered backend is auto-discovered and added to the frontend's list of served models. To register a backend, use the same `register_model()` API. Currently the frontend supports serving the following model type and model input combinations:
* `ModelType::Completions` and `ModelInput::Text`: Combination for LLM backend that uses custom preprocessor
* `ModelType::Completions` and `ModelInput::Tokens`: Combination for LLM backend that uses Dynamo preprocessor (i.e. Dynamo vLLM / SGLang / TRTLLM backend)
......@@ -153,7 +153,7 @@ See [Router Documentation](../router/README.md) for routing configuration detail
### With Backends
Backends auto-register with the frontend when they call `register_llm()`. Supported backends:
Backends auto-register with the frontend when they call `register_model()`. Supported backends:
- [vLLM Backend](../../backends/vllm/README.md)
- [SGLang Backend](../../backends/sglang/README.md)
......
......@@ -23,7 +23,7 @@ This command:
- Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments
......@@ -83,8 +83,8 @@ For more configuration options and tuning guidelines, see the [Router Guide](rou
## Prerequisites and Limitations
**Requirements:**
- **Dynamic endpoints only**: KV router requires `register_llm()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text.
- Backend workers must call `register_llm()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md))
- **Dynamic endpoints only**: KV router requires `register_model()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text.
- Backend workers must call `register_model()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md))
- You cannot use `--static-endpoint` mode with KV routing (use dynamic discovery instead)
**Multimodal Support:**
......
......@@ -377,11 +377,11 @@ class CustomEnginePublisher:
#### Integration with Your Engine
```python
from dynamo.llm import register_llm
from dynamo.llm import register_model
async def main():
# Register your engine with Dynamo
component, endpoint = await register_llm(
component, endpoint = await register_model(
model="my-model",
generator=my_generate_fn,
)
......
......@@ -27,7 +27,7 @@ This command:
- Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments
......@@ -267,7 +267,7 @@ Dynamo supports disaggregated serving where prefill (prompt processing) and deco
### Automatic Prefill Router Activation
The prefill router is automatically created when:
1. A decode model is registered (e.g., via `register_llm()` with `ModelType.Chat | ModelType.Completions`)
1. A decode model is registered (e.g., via `register_model()` with `ModelType.Chat | ModelType.Completions`)
2. A prefill worker is detected with the same model name and `ModelType.Prefill`
**Key characteristics of the prefill router:**
......@@ -283,7 +283,7 @@ When both workers are registered, requests are automatically routed.
# Decode worker registration (in your decode worker)
decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate")
await register_llm(
await register_model(
model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions,
endpoint=decode_endpoint,
......@@ -296,7 +296,7 @@ await decode_endpoint.serve_endpoint(decode_handler.generate)
# Prefill worker registration (in your prefill worker)
prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate")
await register_llm(
await register_model(
model_input=ModelInput.Tokens,
model_type=ModelType.Prefill, # <-- Mark as prefill worker
endpoint=prefill_endpoint,
......
......@@ -17,7 +17,7 @@ The Python file must do three things:
3. Attach a request handler
```
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
# 1. Decorate a function to get the runtime
......@@ -32,8 +32,8 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
endpoint = component.endpoint("endpoint")
# Optional last param to register_llm is model_name. If not present derives it from model_path
await register_llm(model_input, model_type, endpoint, model_path)
# Optional last param to register_model is model_name. If not present derives it from model_path
await register_model(model_input, model_type, endpoint, model_path)
# Initialize your engine here
# engine = ...
......@@ -70,7 +70,7 @@ The `model_type` can be:
- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).
`register_llm` can also take the following kwargs:
`register_model` can also take the following kwargs:
- `model_name`: The name to call the model. The model name in incoming HTTP requests must match this. Defaults to the Hugging Face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
......
......@@ -410,7 +410,7 @@ TRT-LLM workers register with Dynamo using:
```python
# TRT-LLM Worker - Register with Tokens
await register_llm(
await register_model(
ModelInput.Tokens, # Rust does minimal preprocessing
model_type, # ModelType.Chat or ModelType.Prefill
generate_endpoint,
......
......@@ -465,7 +465,7 @@ Dynamo's Rust SDK supports two input types that determine how the HTTP frontend
```python
# Processor - Entry point from HTTP frontend
await register_llm(
await register_model(
ModelInput.Text, # Frontend sends raw text
ModelType.Chat,
generate_endpoint,
......@@ -474,7 +474,7 @@ await register_llm(
)
# Workers - Internal components
await register_llm(
await register_model(
ModelInput.Tokens, # Expect pre-tokenized input
ModelType.Chat, # or ModelType.Prefill for prefill workers
generate_endpoint,
......
......@@ -115,11 +115,11 @@ class CustomEnginePublisher:
### Integration with Your Engine
```python
from dynamo.llm import register_llm
from dynamo.llm import register_model
async def main():
# Register your engine with Dynamo
component, endpoint = await register_llm(
component, endpoint = await register_model(
model="my-model",
generator=my_generate_fn,
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment