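Rename fetch_llm to fetch_model (#6268)

Also renames register_llm to register_model, unregister_llm to
unregister_model, and the sglang helpers register_llm_with_readiness_gate /
_register_llm_with_runtime_config to their *_model_* equivalents, and updates
the docs to match.

Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>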

Rename fetch_llm to fetch_model (#6268)
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
5624d144 · Tzu-Ling Kan · GitHub · 09b6ab2f · 5624d144 · 5624d144
Unverified Commit 5624d144 authored Feb 13, 2026 by Tzu-Ling Kan Committed by GitHub Feb 13, 2026
20 changed files
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -5,7 +5,7 @@
 #
 # Start a frontend node. This runs:
 # - OpenAI HTTP server.
-# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
+# - Auto-discovery: Watches etcd for engine/worker registration (via `register_model`).
 # - Pre-processor: Prompt templating and tokenization.
 # - Router, defaulting to round-robin. Use --router-mode to switch (round-robin, random, kv, direct).
 #

--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -32,7 +32,7 @@ from dynamo.llm import (
    PythonAsyncEngine,
    RouterConfig,
    RouterMode,
-    fetch_llm,
+    fetch_model,
 )
 from dynamo.runtime import DistributedRuntime

@@ -393,7 +393,7 @@ class EngineFactory:

        source_path = mdc.source_path()
        if not os.path.exists(source_path):
-            await fetch_llm(source_path, ignore_weights=True)
+            await fetch_model(source_path, ignore_weights=True)

        tokenizer_mode = getattr(self.flags, "tokenizer_mode", None) or "auto"
        config_format = getattr(self.flags, "config_format", None) or "auto"

--- a/components/src/dynamo/global_router/__main__.py
+++ b/components/src/dynamo/global_router/__main__.py
@@ -11,7 +11,7 @@ but internally routes requests to local routers in different namespaces based on
 a grid-based pool selection strategy.

 Key features:
-- Registers as BOTH prefill AND decode worker via register_llm()
+- Registers as BOTH prefill AND decode worker via register_model()
 - Routes prefill requests based on (ISL, TTFT) to prefill pools
 - Routes decode requests based on (context_length, ITL) to decode pools
 - Connects to local routers in each pool's namespace
@@ -24,7 +24,7 @@ import os

 import uvloop

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -122,7 +122,7 @@ async def worker(runtime: DistributedRuntime):
    logger.info("Registering as prefill worker...")
    # Register as prefill worker - frontend will send prefill requests here
    # Use model_name as model_path since we don't need tokenizer/model files
-    await register_llm(
+    await register_model(
        model_input=ModelInput.Tokens,
        model_type=ModelType.Prefill,
        endpoint=prefill_endpoint,
@@ -135,7 +135,7 @@ async def worker(runtime: DistributedRuntime):

    logger.info("Registering as decode worker...")
    # Register as decode worker - frontend will send decode requests here
-    await register_llm(
+    await register_model(
        model_input=ModelInput.Tokens,
        model_type=ModelType.Chat | ModelType.Completions,
        endpoint=decode_endpoint,

--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -16,7 +16,7 @@ import uvloop

 os.environ.setdefault("DYN_COMPUTE_THREADS", "0")

-from dynamo.llm import EngineType, EntrypointArgs, fetch_llm, make_engine, run_input
+from dynamo.llm import EngineType, EntrypointArgs, fetch_model, make_engine, run_input
 from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -46,7 +46,7 @@ async def prefetch_model(model_path: str) -> None:

    logger.info(f"Pre-fetching model from HuggingFace: {model_path}")
    try:
-        local_path = await fetch_llm(model_path, ignore_weights=True)
+        local_path = await fetch_model(model_path, ignore_weights=True)
        logger.info(f"Model cached at: {local_path}")
    except Exception as e:
        logger.warning(

--- a/components/src/dynamo/sglang/args.py
+++ b/components/src/dynamo/sglang/args.py
@@ -21,7 +21,7 @@ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
 from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
 from dynamo.common.config_dump import register_encoder
 from dynamo.common.utils.runtime import parse_endpoint
-from dynamo.llm import fetch_llm
+from dynamo.llm import fetch_model
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.sglang import __version__

@@ -507,17 +507,17 @@ async def parse_args(args: list[str]) -> Config:
    if not parsed_args.served_model_name:
        parsed_args.served_model_name = model_path
    # Download the model if necessary using modelexpress.
-    # We don't set `parsed_args.model_path` to the local path fetch_llm returns
+    # We don't set `parsed_args.model_path` to the local path fetch_model returns
    # because sglang will send this to its pipeline-parallel workers, which may
    # not have the local path.
    # sglang will attempt to download the model again, but find it in the HF cache.
    # For non-HF models use a path instead of an HF name, and ensure all workers have
    # that path (ideally via a shared folder).
    if not os.path.exists(model_path):
-        await fetch_llm(model_path)
+        await fetch_model(model_path)

    # TODO: sglang downloads the model in `from_cli_args`, which means we had to
-    # fetch_llm (download the model) here, in `parse_args`. `parse_args` should not
+    # fetch_model (download the model) here, in `parse_args`. `parse_args` should not
    # contain code to download a model, it should only parse the args.

    # For diffusion/video workers, create a minimal dummy ServerArgs since diffusion

--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -32,7 +32,7 @@ from dynamo.sglang.health_check import (
 from dynamo.sglang.publisher import DynamoSglangPublisher, setup_sgl_metrics
 from dynamo.sglang.register import (
    register_image_diffusion_model,
-    register_llm_with_readiness_gate,
+    register_model_with_readiness_gate,
    register_video_generation_model,
 )
 from dynamo.sglang.request_handlers import (
@@ -307,7 +307,7 @@ async def init(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
-            register_llm_with_readiness_gate(
+            register_model_with_readiness_gate(
                engine,
                generate_endpoint,
                server_args,
@@ -385,7 +385,7 @@ async def init_prefill(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
-            register_llm_with_readiness_gate(
+            register_model_with_readiness_gate(
                engine,
                generate_endpoint,
                server_args,
@@ -474,7 +474,7 @@ async def init_diffusion(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
-            register_llm_with_readiness_gate(
+            register_model_with_readiness_gate(
                engine,
                generate_endpoint,
                server_args,
@@ -538,7 +538,7 @@ async def init_embedding(
                metrics_labels=metrics_labels,
                health_check_payload=health_check_payload,
            ),
-            register_llm_with_readiness_gate(
+            register_model_with_readiness_gate(
                engine,
                generate_endpoint,
                server_args,
@@ -766,7 +766,7 @@ async def init_multimodal_processor(
                    (prometheus_names.labels.MODEL_NAME, server_args.served_model_name),
                ],
            ),
-            register_llm_with_readiness_gate(
+            register_model_with_readiness_gate(
                None,  # engine
                generate_endpoint,
                server_args,

--- a/components/src/dynamo/sglang/register.py
+++ b/components/src/dynamo/sglang/register.py
@@ -11,11 +11,11 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_local_ip_auto

 from dynamo._core import Endpoint
-from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
 from dynamo.sglang.args import DynamoArgs


-async def _register_llm_with_runtime_config(
+async def _register_model_with_runtime_config(
    engine: sgl.Engine,
    endpoint: Endpoint,
    server_args: ServerArgs,
@@ -49,7 +49,7 @@ async def _register_llm_with_runtime_config(
            output_type = ModelType.Chat

    try:
-        await register_llm(
+        await register_model(
            input_type,
            output_type,
            endpoint,
@@ -231,7 +231,7 @@ async def _get_runtime_config(
        return runtime_config


-async def register_llm_with_readiness_gate(
+async def register_model_with_readiness_gate(
    engine: sgl.Engine,
    generate_endpoint: Endpoint,
    server_args: ServerArgs,
@@ -254,7 +254,7 @@ async def register_llm_with_readiness_gate(
    Raises:
        RuntimeError: If model registration fails.
    """
-    registration_success = await _register_llm_with_runtime_config(
+    registration_success = await _register_model_with_runtime_config(
        engine,
        generate_endpoint,
        server_args,
@@ -295,7 +295,7 @@ async def register_image_diffusion_model(
    model_name = server_args.model_path

    try:
-        await register_llm(
+        await register_model(
            ModelInput.Text,
            ModelType.Images,
            endpoint,
@@ -335,7 +335,7 @@ async def register_video_generation_model(
    model_name = server_args.model_path

    try:
-        await register_llm(
+        await register_model(
            ModelInput.Text,
            ModelType.Videos,
            endpoint,

--- a/components/src/dynamo/trtllm/workers/llm_worker.py
+++ b/components/src/dynamo/trtllm/workers/llm_worker.py
@@ -42,7 +42,7 @@ from dynamo.llm import (
    ModelInput,
    ModelRuntimeConfig,
    ModelType,
-    register_llm,
+    register_model,
 )
 from dynamo.runtime import DistributedRuntime
 from dynamo.trtllm.constants import DisaggregationMode
@@ -437,7 +437,7 @@ async def init_llm_worker(
        # Encode workers do NOT register - they're internal workers only
        # Prefill and decode workers register - frontend detects their role via ModelType
        if config.disaggregation_mode != DisaggregationMode.ENCODE:
-            await register_llm(
+            await register_model(
                model_input,
                model_type,
                endpoint,

--- a/components/src/dynamo/trtllm/workers/video_diffusion_worker.py
+++ b/components/src/dynamo/trtllm/workers/video_diffusion_worker.py
@@ -10,7 +10,7 @@ workers using diffusion models (Wan, Flux, Cosmos, etc.).
 import asyncio
 import logging

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime
 from dynamo.trtllm.utils.trtllm_utils import Config

@@ -91,9 +91,8 @@ async def init_video_diffusion_worker(

    logging.info(f"Registering model '{model_name}' with ModelType={model_type}")

-    # register_llm is a misnomer — it's actually Dynamo's generic model
-    # registration function and the video diffisuion model is not an llm
-    await register_llm(
+    # register_model is Dynamo's generic model registration function
+    await register_model(
        ModelInput.Text,
        model_type,
        endpoint,

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -31,8 +31,8 @@ from dynamo.llm import (
    ModelInput,
    ModelType,
    lora_name_to_id,
-    register_llm,
-    unregister_llm,
+    register_model,
+    unregister_model,
 )
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -571,7 +571,7 @@ class BaseWorkerHandler(ABC):
                            }

                            # Publish with format: v1/mdc/dynamo/backend/generate/{instance_id}/{lora_slug}
-                            await register_llm(
+                            await register_model(
                                model_input=ModelInput.Tokens,
                                model_type=ModelType.Chat | ModelType.Completions,
                                endpoint=self.generate_endpoint,
@@ -691,7 +691,7 @@ class BaseWorkerHandler(ABC):
                            f"Unregistering LoRA '{lora_name}' ModelDeploymentCard"
                        )
                        try:
-                            await unregister_llm(
+                            await unregister_model(
                                endpoint=self.generate_endpoint,
                                lora_name=lora_name,
                            )

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -28,8 +28,8 @@ from dynamo.llm import (
    ModelInput,
    ModelRuntimeConfig,
    ModelType,
-    fetch_llm,
-    register_llm,
+    fetch_model,
+    register_model,
 )

 # Optional imports for frontend decoding support
@@ -113,14 +113,14 @@ async def worker():
    # Download the model if necessary using modelexpress.
    # We want it on disk before we start vllm to avoid downloading from HuggingFace.
    #
-    # We don't set `config.engine_args.model` to the local path fetch_llm returns
+    # We don't set `config.engine_args.model` to the local path fetch_model returns
    # because vllm will send that name to its Ray pipeline-parallel workers, which
    # may not have the local path.
    # vllm will attempt to download the model again, but find it in the HF cache.
    # For non-HF models use a path instead of an HF name, and ensure all workers have
    # that path (ideally via a shared folder).
    if not os.path.exists(config.model):
-        await fetch_llm(config.model)
+        await fetch_model(config.model)

    # CHECKPOINT MODE: Load engine BEFORE runtime creation
    # This allows checkpointing GPU state before runtime connections are established
@@ -517,7 +517,7 @@ async def register_vllm_model(
        media_fetcher.timeout_ms(30000)
        media_fetcher.allow_direct_port(True)

-    await register_llm(
+    await register_model(
        model_input,
        model_type,
        generate_endpoint,
@@ -953,7 +953,7 @@ async def init_multimodal_processor(
    await encode_worker_client.wait_for_instances()

    # Register the endpoint as entrypoint to a model
-    await register_llm(
+    await register_model(
        ModelInput.Tokens,
        ModelType.Chat,
        generate_endpoint,
@@ -1141,7 +1141,7 @@ async def init_ec_processor(
    await pd_client.wait_for_instances()

    # Register the endpoint as entrypoint to a model (same as preprocessed_handler)
-    await register_llm(
+    await register_model(
        ModelInput.Tokens,  # Use Rust tokenization for better performance and multi-image support
        ModelType.Chat,
        generate_endpoint,
@@ -1324,7 +1324,7 @@ async def init_omni(
        return

    # TODO: extend for multi-stage pipelines
-    await register_llm(
+    await register_model(
        ModelInput.Text,
        ModelType.Images,
        generate_endpoint,

--- a/docs/pages/components/frontend/README.md
+++ b/docs/pages/components/frontend/README.md
@@ -34,7 +34,7 @@ The Dynamo Frontend is the API gateway for serving LLM inference requests. It pr
 python -m dynamo.frontend --http-port 8000
 ```

-This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_llm`.
+This starts an OpenAI-compatible HTTP server with integrated preprocessing and routing. Backends are auto-discovered when they call `register_model`.

 ### KServe gRPC Frontend


--- a/docs/pages/components/frontend/frontend-guide.md
+++ b/docs/pages/components/frontend/frontend-guide.md
@@ -57,7 +57,7 @@ Tune these values based on your workload. Connection window should accommodate `

 ## Registering a Backend

-Similar to HTTP frontend, the registered backend will be auto-discovered and added to the frontend list of serving model. To register a backend, the same `register_llm()` API will be used. Currently the frontend support serving of the following model type and model input combination:
+Similar to the HTTP frontend, a registered backend is auto-discovered and added to the frontend's list of served models. To register a backend, use the same `register_model()` API. Currently the frontend supports serving the following model type and model input combinations:

 * `ModelType::Completions` and `ModelInput::Text`: Combination for LLM backend that uses custom preprocessor
 * `ModelType::Completions` and `ModelInput::Token`: Combination for LLM backend that uses Dynamo preprocessor (i.e. Dynamo vLLM / SGLang / TRTLLM backend)
@@ -153,7 +153,7 @@ See [Router Documentation](../router/README.md) for routing configuration detail

 ### With Backends

-Backends auto-register with the frontend when they call `register_llm()`. Supported backends:
+Backends auto-register with the frontend when they call `register_model()`. Supported backends:

 - [vLLM Backend](../../backends/vllm/README.md)
 - [SGLang Backend](../../backends/sglang/README.md)

--- a/docs/pages/components/router/README.md
+++ b/docs/pages/components/router/README.md
@@ -23,7 +23,7 @@ This command:
 - Exposes the service on port 8000 (configurable)
 - Automatically handles all backend workers registered to the Dynamo endpoint

-Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
+Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.

 #### CLI Arguments

@@ -83,8 +83,8 @@ For more configuration options and tuning guidelines, see the [Router Guide](rou
 ## Prerequisites and Limitations

 **Requirements:**
-- **Dynamic endpoints only**: KV router requires `register_llm()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text.
-- Backend workers must call `register_llm()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md))
+- **Dynamic endpoints only**: KV router requires `register_model()` with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text.
+- Backend workers must call `register_model()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../../development/backend-guide.md))
 - You cannot use `--static-endpoint` mode with KV routing (use dynamic discovery instead)

 **Multimodal Support:**

--- a/docs/pages/components/router/router-examples.md
+++ b/docs/pages/components/router/router-examples.md
@@ -377,11 +377,11 @@ class CustomEnginePublisher:
 #### Integration with Your Engine

 ```python
-from dynamo.llm import register_llm
+from dynamo.llm import register_model

 async def main():
    # Register your engine with Dynamo
-    component, endpoint = await register_llm(
+    component, endpoint = await register_model(
        model="my-model",
        generator=my_generate_fn,
    )

--- a/docs/pages/components/router/router-guide.md
+++ b/docs/pages/components/router/router-guide.md
@@ -27,7 +27,7 @@ This command:
 - Exposes the service on port 8000 (configurable)
 - Automatically handles all backend workers registered to the Dynamo endpoint

-Backend workers register themselves using the `register_llm` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
+Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.

 #### CLI Arguments

@@ -267,7 +267,7 @@ Dynamo supports disaggregated serving where prefill (prompt processing) and deco
 ### Automatic Prefill Router Activation

 The prefill router is automatically created when:
-1. A decode model is registered (e.g., via `register_llm()` with `ModelType.Chat | ModelType.Completions`)
+1. A decode model is registered (e.g., via `register_model()` with `ModelType.Chat | ModelType.Completions`)
 2. A prefill worker is detected with the same model name and `ModelType.Prefill`

 **Key characteristics of the prefill router:**
@@ -283,7 +283,7 @@ When both workers are registered, requests are automatically routed.
 # Decode worker registration (in your decode worker)
 decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate")

-await register_llm(
+await register_model(
    model_input=ModelInput.Tokens,
    model_type=ModelType.Chat | ModelType.Completions,
    endpoint=decode_endpoint,
@@ -296,7 +296,7 @@ await decode_endpoint.serve_endpoint(decode_handler.generate)
 # Prefill worker registration (in your prefill worker)
 prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate")

-await register_llm(
+await register_model(
    model_input=ModelInput.Tokens,
    model_type=ModelType.Prefill,  # <-- Mark as prefill worker
    endpoint=prefill_endpoint,

--- a/docs/pages/development/backend-guide.md
+++ b/docs/pages/development/backend-guide.md
@@ -17,7 +17,7 @@ The Python file must do three things:
 3. Attach a request handler

 ```
-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker

   # 1. Decorate a function to get the runtime
@@ -32,8 +32,8 @@ from dynamo.runtime import DistributedRuntime, dynamo_worker
    model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing
    model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints
    endpoint = component.endpoint("endpoint")
-    # Optional last param to register_llm is model_name. If not present derives it from model_path
-    await register_llm(model_input, model_type, endpoint, model_path)
+    # Optional last param to register_model is model_name. If not present derives it from model_path
+    await register_model(model_input, model_type, endpoint, model_path)

    # Initialize your engine here
    # engine = ...
@@ -70,7 +70,7 @@ The `model_type` can be:
 - ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat).
 - ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).

-`register_llm` can also take the following kwargs:
+`register_model` can also take the following kwargs:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
 - `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
 - `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.

--- a/docs/pages/features/multimodal/multimodal-trtllm.md
+++ b/docs/pages/features/multimodal/multimodal-trtllm.md
@@ -410,7 +410,7 @@ TRT-LLM workers register with Dynamo using:

 ```python
 # TRT-LLM Worker - Register with Tokens
-await register_llm(
+await register_model(
    ModelInput.Tokens,      # Rust does minimal preprocessing
    model_type,             # ModelType.Chat or ModelType.Prefill
    generate_endpoint,

--- a/docs/pages/features/multimodal/multimodal-vllm.md
+++ b/docs/pages/features/multimodal/multimodal-vllm.md
@@ -465,7 +465,7 @@ Dynamo's Rust SDK supports two input types that determine how the HTTP frontend

 ```python
 # Processor - Entry point from HTTP frontend
-await register_llm(
+await register_model(
    ModelInput.Text,        # Frontend sends raw text
    ModelType.Chat,
    generate_endpoint,
@@ -474,7 +474,7 @@ await register_llm(
 )

 # Workers - Internal components
-await register_llm(
+await register_model(
    ModelInput.Tokens,      # Expect pre-tokenized input
    ModelType.Chat,         # or ModelType.Prefill for prefill workers
    generate_endpoint,

--- a/docs/pages/integrations/kv-events-custom-engines.md
+++ b/docs/pages/integrations/kv-events-custom-engines.md
@@ -115,11 +115,11 @@ class CustomEnginePublisher:
 ### Integration with Your Engine

 ```python
-from dynamo.llm import register_llm
+from dynamo.llm import register_model

 async def main():
    # Register your engine with Dynamo
-    component, endpoint = await register_llm(
+    component, endpoint = await register_model(
        model="my-model",
        generator=my_generate_fn,
    )