Unverified commit 5624d144, authored by Tzu-Ling Kan, committed by GitHub
Browse files

Rename fetch_llm to fetch_model (#6268)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
parent 09b6ab2f
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# #
# Start a frontend node. This runs: # Start a frontend node. This runs:
# - OpenAI HTTP server. # - OpenAI HTTP server.
# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`). # - Auto-discovery: Watches etcd for engine/worker registration (via `register_model`).
# - Pre-processor: Prompt templating and tokenization. # - Pre-processor: Prompt templating and tokenization.
# - Router, defaulting to round-robin. Use --router-mode to switch (round-robin, random, kv, direct). # - Router, defaulting to round-robin. Use --router-mode to switch (round-robin, random, kv, direct).
# #
......
...@@ -32,7 +32,7 @@ from dynamo.llm import ( ...@@ -32,7 +32,7 @@ from dynamo.llm import (
PythonAsyncEngine, PythonAsyncEngine,
RouterConfig, RouterConfig,
RouterMode, RouterMode,
fetch_llm, fetch_model,
) )
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
...@@ -393,7 +393,7 @@ class EngineFactory: ...@@ -393,7 +393,7 @@ class EngineFactory:
source_path = mdc.source_path() source_path = mdc.source_path()
if not os.path.exists(source_path): if not os.path.exists(source_path):
await fetch_llm(source_path, ignore_weights=True) await fetch_model(source_path, ignore_weights=True)
tokenizer_mode = getattr(self.flags, "tokenizer_mode", None) or "auto" tokenizer_mode = getattr(self.flags, "tokenizer_mode", None) or "auto"
config_format = getattr(self.flags, "config_format", None) or "auto" config_format = getattr(self.flags, "config_format", None) or "auto"
......
...@@ -11,7 +11,7 @@ but internally routes requests to local routers in different namespaces based on ...@@ -11,7 +11,7 @@ but internally routes requests to local routers in different namespaces based on
a grid-based pool selection strategy. a grid-based pool selection strategy.
Key features: Key features:
- Registers as BOTH prefill AND decode worker via register_llm() - Registers as BOTH prefill AND decode worker via register_model()
- Routes prefill requests based on (ISL, TTFT) to prefill pools - Routes prefill requests based on (ISL, TTFT) to prefill pools
- Routes decode requests based on (context_length, ITL) to decode pools - Routes decode requests based on (context_length, ITL) to decode pools
- Connects to local routers in each pool's namespace - Connects to local routers in each pool's namespace
...@@ -24,7 +24,7 @@ import os ...@@ -24,7 +24,7 @@ import os
import uvloop import uvloop
from dynamo.llm import ModelInput, ModelType, register_llm from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -122,7 +122,7 @@ async def worker(runtime: DistributedRuntime): ...@@ -122,7 +122,7 @@ async def worker(runtime: DistributedRuntime):
logger.info("Registering as prefill worker...") logger.info("Registering as prefill worker...")
# Register as prefill worker - frontend will send prefill requests here # Register as prefill worker - frontend will send prefill requests here
# Use model_name as model_path since we don't need tokenizer/model files # Use model_name as model_path since we don't need tokenizer/model files
await register_llm( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
model_type=ModelType.Prefill, model_type=ModelType.Prefill,
endpoint=prefill_endpoint, endpoint=prefill_endpoint,
...@@ -135,7 +135,7 @@ async def worker(runtime: DistributedRuntime): ...@@ -135,7 +135,7 @@ async def worker(runtime: DistributedRuntime):
logger.info("Registering as decode worker...") logger.info("Registering as decode worker...")
# Register as decode worker - frontend will send decode requests here # Register as decode worker - frontend will send decode requests here
await register_llm( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions, model_type=ModelType.Chat | ModelType.Completions,
endpoint=decode_endpoint, endpoint=decode_endpoint,
......
...@@ -16,7 +16,7 @@ import uvloop ...@@ -16,7 +16,7 @@ import uvloop
os.environ.setdefault("DYN_COMPUTE_THREADS", "0") os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
from dynamo.llm import EngineType, EntrypointArgs, fetch_llm, make_engine, run_input from dynamo.llm import EngineType, EntrypointArgs, fetch_model, make_engine, run_input
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -46,7 +46,7 @@ async def prefetch_model(model_path: str) -> None: ...@@ -46,7 +46,7 @@ async def prefetch_model(model_path: str) -> None:
logger.info(f"Pre-fetching model from HuggingFace: {model_path}") logger.info(f"Pre-fetching model from HuggingFace: {model_path}")
try: try:
local_path = await fetch_llm(model_path, ignore_weights=True) local_path = await fetch_model(model_path, ignore_weights=True)
logger.info(f"Model cached at: {local_path}") logger.info(f"Model cached at: {local_path}")
except Exception as e: except Exception as e:
logger.warning( logger.warning(
......
...@@ -21,7 +21,7 @@ from sglang.srt.server_args_config_parser import ConfigArgumentMerger ...@@ -21,7 +21,7 @@ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
from dynamo._core import get_reasoning_parser_names, get_tool_parser_names from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
from dynamo.common.config_dump import register_encoder from dynamo.common.config_dump import register_encoder
from dynamo.common.utils.runtime import parse_endpoint from dynamo.common.utils.runtime import parse_endpoint
from dynamo.llm import fetch_llm from dynamo.llm import fetch_model
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang import __version__ from dynamo.sglang import __version__
...@@ -507,17 +507,17 @@ async def parse_args(args: list[str]) -> Config: ...@@ -507,17 +507,17 @@ async def parse_args(args: list[str]) -> Config:
if not parsed_args.served_model_name: if not parsed_args.served_model_name:
parsed_args.served_model_name = model_path parsed_args.served_model_name = model_path
# Download the model if necessary using modelexpress. # Download the model if necessary using modelexpress.
# We don't set `parsed_args.model_path` to the local path fetch_llm returns # We don't set `parsed_args.model_path` to the local path fetch_model returns
# because sglang will send this to its pipeline-parallel workers, which may # because sglang will send this to its pipeline-parallel workers, which may
# not have the local path. # not have the local path.
# sglang will attempt to download the model again, but find it in the HF cache. # sglang will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have # For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder). # that path (ideally via a shared folder).
if not os.path.exists(model_path): if not os.path.exists(model_path):
await fetch_llm(model_path) await fetch_model(model_path)
# TODO: sglang downloads the model in `from_cli_args`, which means we had to # TODO: sglang downloads the model in `from_cli_args`, which means we had to
# fetch_llm (download the model) here, in `parse_args`. `parse_args` should not # fetch_model (download the model) here, in `parse_args`. `parse_args` should not
# contain code to download a model, it should only parse the args. # contain code to download a model, it should only parse the args.
# For diffusion/video workers, create a minimal dummy ServerArgs since diffusion # For diffusion/video workers, create a minimal dummy ServerArgs since diffusion
......
...@@ -32,7 +32,7 @@ from dynamo.sglang.health_check import ( ...@@ -32,7 +32,7 @@ from dynamo.sglang.health_check import (
from dynamo.sglang.publisher import DynamoSglangPublisher, setup_sgl_metrics from dynamo.sglang.publisher import DynamoSglangPublisher, setup_sgl_metrics
from dynamo.sglang.register import ( from dynamo.sglang.register import (
register_image_diffusion_model, register_image_diffusion_model,
register_llm_with_readiness_gate, register_model_with_readiness_gate,
register_video_generation_model, register_video_generation_model,
) )
from dynamo.sglang.request_handlers import ( from dynamo.sglang.request_handlers import (
...@@ -307,7 +307,7 @@ async def init( ...@@ -307,7 +307,7 @@ async def init(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
register_llm_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
server_args, server_args,
...@@ -385,7 +385,7 @@ async def init_prefill( ...@@ -385,7 +385,7 @@ async def init_prefill(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
register_llm_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
server_args, server_args,
...@@ -474,7 +474,7 @@ async def init_diffusion( ...@@ -474,7 +474,7 @@ async def init_diffusion(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
register_llm_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
server_args, server_args,
...@@ -538,7 +538,7 @@ async def init_embedding( ...@@ -538,7 +538,7 @@ async def init_embedding(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
register_llm_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
server_args, server_args,
...@@ -766,7 +766,7 @@ async def init_multimodal_processor( ...@@ -766,7 +766,7 @@ async def init_multimodal_processor(
(prometheus_names.labels.MODEL_NAME, server_args.served_model_name), (prometheus_names.labels.MODEL_NAME, server_args.served_model_name),
], ],
), ),
register_llm_with_readiness_gate( register_model_with_readiness_gate(
None, # engine None, # engine
generate_endpoint, generate_endpoint,
server_args, server_args,
......
...@@ -11,11 +11,11 @@ from sglang.srt.server_args import ServerArgs ...@@ -11,11 +11,11 @@ from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import get_local_ip_auto from sglang.srt.utils import get_local_ip_auto
from dynamo._core import Endpoint from dynamo._core import Endpoint
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.sglang.args import DynamoArgs from dynamo.sglang.args import DynamoArgs
async def _register_llm_with_runtime_config( async def _register_model_with_runtime_config(
engine: sgl.Engine, engine: sgl.Engine,
endpoint: Endpoint, endpoint: Endpoint,
server_args: ServerArgs, server_args: ServerArgs,
...@@ -49,7 +49,7 @@ async def _register_llm_with_runtime_config( ...@@ -49,7 +49,7 @@ async def _register_llm_with_runtime_config(
output_type = ModelType.Chat output_type = ModelType.Chat
try: try:
await register_llm( await register_model(
input_type, input_type,
output_type, output_type,
endpoint, endpoint,
...@@ -231,7 +231,7 @@ async def _get_runtime_config( ...@@ -231,7 +231,7 @@ async def _get_runtime_config(
return runtime_config return runtime_config
async def register_llm_with_readiness_gate( async def register_model_with_readiness_gate(
engine: sgl.Engine, engine: sgl.Engine,
generate_endpoint: Endpoint, generate_endpoint: Endpoint,
server_args: ServerArgs, server_args: ServerArgs,
...@@ -254,7 +254,7 @@ async def register_llm_with_readiness_gate( ...@@ -254,7 +254,7 @@ async def register_llm_with_readiness_gate(
Raises: Raises:
RuntimeError: If model registration fails. RuntimeError: If model registration fails.
""" """
registration_success = await _register_llm_with_runtime_config( registration_success = await _register_model_with_runtime_config(
engine, engine,
generate_endpoint, generate_endpoint,
server_args, server_args,
...@@ -295,7 +295,7 @@ async def register_image_diffusion_model( ...@@ -295,7 +295,7 @@ async def register_image_diffusion_model(
model_name = server_args.model_path model_name = server_args.model_path
try: try:
await register_llm( await register_model(
ModelInput.Text, ModelInput.Text,
ModelType.Images, ModelType.Images,
endpoint, endpoint,
...@@ -335,7 +335,7 @@ async def register_video_generation_model( ...@@ -335,7 +335,7 @@ async def register_video_generation_model(
model_name = server_args.model_path model_name = server_args.model_path
try: try:
await register_llm( await register_model(
ModelInput.Text, ModelInput.Text,
ModelType.Videos, ModelType.Videos,
endpoint, endpoint,
......
...@@ -42,7 +42,7 @@ from dynamo.llm import ( ...@@ -42,7 +42,7 @@ from dynamo.llm import (
ModelInput, ModelInput,
ModelRuntimeConfig, ModelRuntimeConfig,
ModelType, ModelType,
register_llm, register_model,
) )
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.trtllm.constants import DisaggregationMode from dynamo.trtllm.constants import DisaggregationMode
...@@ -437,7 +437,7 @@ async def init_llm_worker( ...@@ -437,7 +437,7 @@ async def init_llm_worker(
# Encode workers do NOT register - they're internal workers only # Encode workers do NOT register - they're internal workers only
# Prefill and decode workers register - frontend detects their role via ModelType # Prefill and decode workers register - frontend detects their role via ModelType
if config.disaggregation_mode != DisaggregationMode.ENCODE: if config.disaggregation_mode != DisaggregationMode.ENCODE:
await register_llm( await register_model(
model_input, model_input,
model_type, model_type,
endpoint, endpoint,
......
...@@ -10,7 +10,7 @@ workers using diffusion models (Wan, Flux, Cosmos, etc.). ...@@ -10,7 +10,7 @@ workers using diffusion models (Wan, Flux, Cosmos, etc.).
import asyncio import asyncio
import logging import logging
from dynamo.llm import ModelInput, ModelType, register_llm from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.trtllm.utils.trtllm_utils import Config from dynamo.trtllm.utils.trtllm_utils import Config
...@@ -91,9 +91,8 @@ async def init_video_diffusion_worker( ...@@ -91,9 +91,8 @@ async def init_video_diffusion_worker(
logging.info(f"Registering model '{model_name}' with ModelType={model_type}") logging.info(f"Registering model '{model_name}' with ModelType={model_type}")
# register_llm is a misnomer — it's actually Dynamo's generic model # register_model is Dynamo's generic model registration function
# registration function and the video diffisuion model is not an llm await register_model(
await register_llm(
ModelInput.Text, ModelInput.Text,
model_type, model_type,
endpoint, endpoint,
......
...@@ -31,8 +31,8 @@ from dynamo.llm import ( ...@@ -31,8 +31,8 @@ from dynamo.llm import (
ModelInput, ModelInput,
ModelType, ModelType,
lora_name_to_id, lora_name_to_id,
register_llm, register_model,
unregister_llm, unregister_model,
) )
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -571,7 +571,7 @@ class BaseWorkerHandler(ABC): ...@@ -571,7 +571,7 @@ class BaseWorkerHandler(ABC):
} }
# Publish with format: v1/mdc/dynamo/backend/generate/{instance_id}/{lora_slug} # Publish with format: v1/mdc/dynamo/backend/generate/{instance_id}/{lora_slug}
await register_llm( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions, model_type=ModelType.Chat | ModelType.Completions,
endpoint=self.generate_endpoint, endpoint=self.generate_endpoint,
...@@ -691,7 +691,7 @@ class BaseWorkerHandler(ABC): ...@@ -691,7 +691,7 @@ class BaseWorkerHandler(ABC):
f"Unregistering LoRA '{lora_name}' ModelDeploymentCard" f"Unregistering LoRA '{lora_name}' ModelDeploymentCard"
) )
try: try:
await unregister_llm( await unregister_model(
endpoint=self.generate_endpoint, endpoint=self.generate_endpoint,
lora_name=lora_name, lora_name=lora_name,
) )
......
...@@ -28,8 +28,8 @@ from dynamo.llm import ( ...@@ -28,8 +28,8 @@ from dynamo.llm import (
ModelInput, ModelInput,
ModelRuntimeConfig, ModelRuntimeConfig,
ModelType, ModelType,
fetch_llm, fetch_model,
register_llm, register_model,
) )
# Optional imports for frontend decoding support # Optional imports for frontend decoding support
...@@ -113,14 +113,14 @@ async def worker(): ...@@ -113,14 +113,14 @@ async def worker():
# Download the model if necessary using modelexpress. # Download the model if necessary using modelexpress.
# We want it on disk before we start vllm to avoid downloading from HuggingFace. # We want it on disk before we start vllm to avoid downloading from HuggingFace.
# #
# We don't set `config.engine_args.model` to the local path fetch_llm returns # We don't set `config.engine_args.model` to the local path fetch_model returns
# because vllm will send that name to its Ray pipeline-parallel workers, which # because vllm will send that name to its Ray pipeline-parallel workers, which
# may not have the local path. # may not have the local path.
# vllm will attempt to download the model again, but find it in the HF cache. # vllm will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have # For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder). # that path (ideally via a shared folder).
if not os.path.exists(config.model): if not os.path.exists(config.model):
await fetch_llm(config.model) await fetch_model(config.model)
# CHECKPOINT MODE: Load engine BEFORE runtime creation # CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established # This allows checkpointing GPU state before runtime connections are established
...@@ -517,7 +517,7 @@ async def register_vllm_model( ...@@ -517,7 +517,7 @@ async def register_vllm_model(
media_fetcher.timeout_ms(30000) media_fetcher.timeout_ms(30000)
media_fetcher.allow_direct_port(True) media_fetcher.allow_direct_port(True)
await register_llm( await register_model(
model_input, model_input,
model_type, model_type,
generate_endpoint, generate_endpoint,
...@@ -953,7 +953,7 @@ async def init_multimodal_processor( ...@@ -953,7 +953,7 @@ async def init_multimodal_processor(
await encode_worker_client.wait_for_instances() await encode_worker_client.wait_for_instances()
# Register the endpoint as entrypoint to a model # Register the endpoint as entrypoint to a model
await register_llm( await register_model(
ModelInput.Tokens, ModelInput.Tokens,
ModelType.Chat, ModelType.Chat,
generate_endpoint, generate_endpoint,
...@@ -1141,7 +1141,7 @@ async def init_ec_processor( ...@@ -1141,7 +1141,7 @@ async def init_ec_processor(
await pd_client.wait_for_instances() await pd_client.wait_for_instances()
# Register the endpoint as entrypoint to a model (same as preprocessed_handler) # Register the endpoint as entrypoint to a model (same as preprocessed_handler)
await register_llm( await register_model(
ModelInput.Tokens, # Use Rust tokenization for better performance and multi-image support ModelInput.Tokens, # Use Rust tokenization for better performance and multi-image support
ModelType.Chat, ModelType.Chat,
generate_endpoint, generate_endpoint,
...@@ -1324,7 +1324,7 @@ async def init_omni( ...@@ -1324,7 +1324,7 @@ async def init_omni(
return return
# TODO: extend for multi-stage pipelines # TODO: extend for multi-stage pipelines
await register_llm( await register_model(
ModelInput.Text, ModelInput.Text,
ModelType.Images, ModelType.Images,
generate_endpoint, generate_endpoint,
......
...@@ -34,7 +34,7 @@ The Dynamo Frontend is the API gateway for serving LLM inference requests. It pr ...@@ -34,7 +34,7 @@ The Dynamo Frontend is the API gateway for serving LLM inference requests. It pr
python -m dynamo.frontend --http-port 8000 python -m dynamo.frontend --http-port 8000
``` ```
This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_llm`. This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_model`.
### KServe gRPC Frontend ### KServe gRPC Frontend
......
...@@ -57,7 +57,7 @@ Tune these values based on your workload. Connection window should accommodate ` ...@@ -57,7 +57,7 @@ Tune these values based on your workload. Connection window should accommodate `
## Registering a Backend ## Registering a Backend
Similar to HTTP frontend, the registered backend will be auto-discovered and added to the frontend list of serving model. To register a backend, the same `register_llm()` API will be used. Currently the frontend support serving of the following model type and model input combination: Similar to HTTP frontend, the registered backend will be auto-discovered and added to the frontend list of serving model. To register a backend, the same `register_model()` API will be used. Currently the frontend support serving of the following model type and model input combination:
* `ModelType::Completions` and `ModelInput::Text`: Combination for LLM backend that uses custom preprocessor * `ModelType::Completions` and `ModelInput::Text`: Combination for LLM backend that uses custom preprocessor
* `ModelType::Completions` and `ModelInput::Token`: Combination for LLM backend that uses Dynamo preprocessor (i.e. Dynamo vLLM / SGLang / TRTLLM backend) * `ModelType::Completions` and `ModelInput::Token`: Combination for LLM backend that uses Dynamo preprocessor (i.e. Dynamo vLLM / SGLang / TRTLLM backend)
...@@ -153,7 +153,7 @@ See [Router Documentation](../router/README.md) for routing configuration detail ...@@ -153,7 +153,7 @@ See [Router Documentation](../router/README.md) for routing configuration detail
### With Backends ### With Backends
Backends auto-register with the frontend when they call `register_llm()`. Supported backends: Backends auto-register with the frontend when they call `register_model()`. Supported backends:
- [vLLM Backend](../../backends/vllm/README.md) - [vLLM Backend](../../backends/vllm/README.md)
- [SGLang Backend](../../backends/sglang/README.md) - [SGLang Backend](../../backends/sglang/README.md)
......
...@@ -23,7 +23,7 @@ This command: ...@@ -23,7 +23,7 @@ This command:
- Exposes the service on port 8000 (configurable) - Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint - Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap. Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments #### CLI Arguments
...@@ -83,8 +83,8 @@ For more configuration options and tuning guidelines, see the [Router Guide](rou ...@@ -83,8 +83,8 @@ For more configuration options and tuning guidelines, see the [Router Guide](rou
## Prerequisites and Limitations ## Prerequisites and Limitations
**Requirements:** **Requirements:**
- **Dynamic endpoints only**: KV router requires `register_llm()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text. - **Dynamic endpoints only**: KV router requires `register_model()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text.
- Backend workers must call `register_llm()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md)) - Backend workers must call `register_model()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md))
- You cannot use `--static-endpoint` mode with KV routing (use dynamic discovery instead) - You cannot use `--static-endpoint` mode with KV routing (use dynamic discovery instead)
**Multimodal Support:** **Multimodal Support:**
......
...@@ -377,11 +377,11 @@ class CustomEnginePublisher: ...@@ -377,11 +377,11 @@ class CustomEnginePublisher:
#### Integration with Your Engine #### Integration with Your Engine
```python ```python
from dynamo.llm import register_llm from dynamo.llm import register_model
async def main(): async def main():
# Register your engine with Dynamo # Register your engine with Dynamo
component, endpoint = await register_llm( component, endpoint = await register_model(
model="my-model", model="my-model",
generator=my_generate_fn, generator=my_generate_fn,
) )
......
...@@ -27,7 +27,7 @@ This command: ...@@ -27,7 +27,7 @@ This command:
- Exposes the service on port 8000 (configurable) - Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint - Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap. Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments #### CLI Arguments
...@@ -267,7 +267,7 @@ Dynamo supports disaggregated serving where prefill (prompt processing) and deco ...@@ -267,7 +267,7 @@ Dynamo supports disaggregated serving where prefill (prompt processing) and deco
### Automatic Prefill Router Activation ### Automatic Prefill Router Activation
The prefill router is automatically created when: The prefill router is automatically created when:
1. A decode model is registered (e.g., via `register_llm()` with `ModelType.Chat | ModelType.Completions`) 1. A decode model is registered (e.g., via `register_model()` with `ModelType.Chat | ModelType.Completions`)
2. A prefill worker is detected with the same model name and `ModelType.Prefill` 2. A prefill worker is detected with the same model name and `ModelType.Prefill`
**Key characteristics of the prefill router:** **Key characteristics of the prefill router:**
...@@ -283,7 +283,7 @@ When both workers are registered, requests are automatically routed. ...@@ -283,7 +283,7 @@ When both workers are registered, requests are automatically routed.
# Decode worker registration (in your decode worker) # Decode worker registration (in your decode worker)
decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate") decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate")
await register_llm( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
model_type=ModelType.Chat | ModelType.Completions, model_type=ModelType.Chat | ModelType.Completions,
endpoint=decode_endpoint, endpoint=decode_endpoint,
...@@ -296,7 +296,7 @@ await decode_endpoint.serve_endpoint(decode_handler.generate) ...@@ -296,7 +296,7 @@ await decode_endpoint.serve_endpoint(decode_handler.generate)
# Prefill worker registration (in your prefill worker) # Prefill worker registration (in your prefill worker)
prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate") prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate")
await register_llm( await register_model(
model_input=ModelInput.Tokens, model_input=ModelInput.Tokens,
model_type=ModelType.Prefill, # <-- Mark as prefill worker model_type=ModelType.Prefill, # <-- Mark as prefill worker
endpoint=prefill_endpoint, endpoint=prefill_endpoint,
......
...@@ -17,7 +17,7 @@ The Python file must do three things: ...@@ -17,7 +17,7 @@ The Python file must do three things:
3. Attach a request handler 3. Attach a request handler
``` ```
from dynamo.llm import ModelInput, ModelType, register_llm from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
# 1. Decorate a function to get the runtime # 1. Decorate a function to get the runtime
...@@ -32,8 +32,8 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker ...@@ -32,8 +32,8 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
endpoint = component.endpoint("endpoint") endpoint = component.endpoint("endpoint")
# Optional last param to register_llm is model_name. If not present derives it from model_path # Optional last param to register_model is model_name. If not present derives it from model_path
await register_llm(model_input, model_type, endpoint, model_path) await register_model(model_input, model_type, endpoint, model_path)
# Initialize your engine here # Initialize your engine here
# engine = ... # engine = ...
...@@ -70,7 +70,7 @@ The `model_type` can be: ...@@ -70,7 +70,7 @@ The `model_type` can be:
- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). - ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). - ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).
`register_llm` can also take the following kwargs: `register_model` can also take the following kwargs:
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name. - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM. - `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16. - `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
......
...@@ -410,7 +410,7 @@ TRT-LLM workers register with Dynamo using: ...@@ -410,7 +410,7 @@ TRT-LLM workers register with Dynamo using:
```python ```python
# TRT-LLM Worker - Register with Tokens # TRT-LLM Worker - Register with Tokens
await register_llm( await register_model(
ModelInput.Tokens, # Rust does minimal preprocessing ModelInput.Tokens, # Rust does minimal preprocessing
model_type, # ModelType.Chat or ModelType.Prefill model_type, # ModelType.Chat or ModelType.Prefill
generate_endpoint, generate_endpoint,
......
...@@ -465,7 +465,7 @@ Dynamo's Rust SDK supports two input types that determine how the HTTP frontend ...@@ -465,7 +465,7 @@ Dynamo's Rust SDK supports two input types that determine how the HTTP frontend
```python ```python
# Processor - Entry point from HTTP frontend # Processor - Entry point from HTTP frontend
await register_llm( await register_model(
ModelInput.Text, # Frontend sends raw text ModelInput.Text, # Frontend sends raw text
ModelType.Chat, ModelType.Chat,
generate_endpoint, generate_endpoint,
...@@ -474,7 +474,7 @@ await register_llm( ...@@ -474,7 +474,7 @@ await register_llm(
) )
# Workers - Internal components # Workers - Internal components
await register_llm( await register_model(
ModelInput.Tokens, # Expect pre-tokenized input ModelInput.Tokens, # Expect pre-tokenized input
ModelType.Chat, # or ModelType.Prefill for prefill workers ModelType.Chat, # or ModelType.Prefill for prefill workers
generate_endpoint, generate_endpoint,
......
...@@ -115,11 +115,11 @@ class CustomEnginePublisher: ...@@ -115,11 +115,11 @@ class CustomEnginePublisher:
### Integration with Your Engine ### Integration with Your Engine
```python ```python
from dynamo.llm import register_llm from dynamo.llm import register_model
async def main(): async def main():
# Register your engine with Dynamo # Register your engine with Dynamo
component, endpoint = await register_llm( component, endpoint = await register_model(
model="my-model", model="my-model",
generator=my_generate_fn, generator=my_generate_fn,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment