Rename fetch_llm to fetch_model (#6268)

Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>

Rename fetch_llm to fetch_model (#6268)
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
5624d144 · Tzu-Ling Kan · GitHub · 09b6ab2f · 5624d144 · 5624d144
Unverified Commit 5624d144 authored Feb 13, 2026 by Tzu-Ling Kan Committed by GitHub Feb 13, 2026
12 changed files
--- a/examples/backends/tritonserver/src/tritonworker.py
+++ b/examples/backends/tritonserver/src/tritonworker.py
@@ -12,7 +12,7 @@ import uvloop
 from google.protobuf import text_format
 from tritonclient.utils import triton_to_np_dtype

-from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -147,8 +147,8 @@ async def triton_worker(runtime: DistributedRuntime, args: argparse.Namespace):
    runtime_config.set_tensor_model_config(model_config)

    logger.info("Attempting to register model with Dynamo runtime...")
-    # Use register_llm for tensor-based models (skips HuggingFace downloads)
-    await register_llm(
+    # Use register_model for tensor-based models (skips HuggingFace downloads)
+    await register_model(
        ModelInput.Tensor,
        ModelType.TensorBased,
        endpoint,

--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -20,7 +20,7 @@ from vllm.outputs import RequestOutput
 from vllm.tokenizers import TokenizerLike as AnyTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging

@@ -318,7 +318,7 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
    await encode_worker_client.wait_for_instances()

    # Register the endpoint as entrypoint to a model
-    await register_llm(
+    await register_model(
        ModelInput.Text,  # Custom processor is used and this type bypasses SDK processor
        ModelType.Chat,
        generate_endpoint,

--- a/lib/bindings/python/examples/hello_world/server_sglang.py
+++ b/lib/bindings/python/examples/hello_world/server_sglang.py
@@ -8,7 +8,7 @@
 # request via NATS to this python script, which runs sglang.
 #
 # The key differences between this and `server_sglang_tok.py` are:
-# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Tokens` input
+# - The `register_model` function registers us as a `Chat` and `Completions` model that accepts `Tokens` input
 # - The `generate` function receives a pre-tokenized request and must return token_ids in the response.
 #
 # Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
@@ -27,7 +27,7 @@ import sglang
 import uvloop
 from sglang.srt.server_args import ServerArgs

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
@@ -91,7 +91,7 @@ async def init(runtime: DistributedRuntime, config: Config):
    component = runtime.namespace(config.namespace).component(config.component)

    endpoint = component.endpoint(config.endpoint)
-    await register_llm(
+    await register_model(
        ModelInput.Tokens,
        ModelType.Chat | ModelType.Completions,
        endpoint,

--- a/lib/bindings/python/examples/hello_world/server_sglang_tok.py
+++ b/lib/bindings/python/examples/hello_world/server_sglang_tok.py
@@ -9,7 +9,7 @@
 # do the pre/post-processing.
 #
 # The key differences between this and `server_sglang.py` are:
-# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Text` input
+# - The `register_model` function registers us as a `Chat` and `Completions` model that accepts `Text` input
 # - The `generate` function receives a chat completion request and must return matching response
 #
 # Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
@@ -31,7 +31,7 @@ from sglang.srt.openai_api.adapter import v1_chat_generate_request
 from sglang.srt.openai_api.protocol import ChatCompletionRequest
 from sglang.srt.server_args import ServerArgs

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
@@ -104,7 +104,7 @@ async def init(runtime: DistributedRuntime, config: Config):
    component = runtime.namespace(config.namespace).component(config.component)

    endpoint = component.endpoint(config.endpoint)
-    await register_llm(
+    await register_model(
        ModelInput.Text, ModelType.Chat | ModelType.Completions, endpoint, config.model
    )


--- a/lib/bindings/python/examples/hello_world/server_vllm.py
+++ b/lib/bindings/python/examples/hello_world/server_vllm.py
@@ -27,7 +27,7 @@ from vllm.entrypoints.openai.api_server import (
 )
 from vllm.inputs import TokensPrompt

-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
@@ -102,7 +102,7 @@ async def init(runtime: DistributedRuntime, config: Config):
    component = runtime.namespace(config.namespace).component(config.component)

    endpoint = component.endpoint(config.endpoint)
-    await register_llm(
+    await register_model(
        ModelInput.Tokens,
        ModelType.Chat | ModelType.Completions,
        endpoint,

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -142,9 +142,9 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(llm::kv::compute_block_hash_for_seq_py, m)?)?;
    m.add_function(wrap_pyfunction!(lora_name_to_id, m)?)?;
    m.add_function(wrap_pyfunction!(log_message, m)?)?;
-    m.add_function(wrap_pyfunction!(register_llm, m)?)?;
-    m.add_function(wrap_pyfunction!(unregister_llm, m)?)?;
-    m.add_function(wrap_pyfunction!(fetch_llm, m)?)?;
+    m.add_function(wrap_pyfunction!(register_model, m)?)?;
+    m.add_function(wrap_pyfunction!(unregister_model, m)?)?;
+    m.add_function(wrap_pyfunction!(fetch_model, m)?)?;
    m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?;
    m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?;

@@ -228,7 +228,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 {
 #[pyfunction]
 #[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
 #[allow(clippy::too_many_arguments)]
-fn register_llm<'p>(
+fn register_model<'p>(
    py: Python<'p>,
    model_input: ModelInput,
    model_type: ModelType,
@@ -409,7 +409,7 @@ fn register_llm<'p>(
 /// - LoRA model: `v1/mdc/{namespace}/{component}/{endpoint}/{instance_id}/{lora_slug}`
 #[pyfunction]
 #[pyo3(signature = (endpoint, lora_name=None))]
-fn unregister_llm<'p>(
+fn unregister_model<'p>(
    py: Python<'p>,
    endpoint: Endpoint,
    lora_name: Option<&str>,
@@ -425,11 +425,11 @@ fn unregister_llm<'p>(
    })
 }

-/// Download a model from Hugging Face, returning it's local path
-/// Example: `model_path = await fetch_llm("Qwen/Qwen3-0.6B")`
+/// Download a model from Hugging Face, returning its local path
+/// Example: `model_path = await fetch_model("Qwen/Qwen3-0.6B")`
 #[pyfunction]
 #[pyo3(signature = (remote_name, ignore_weights=false))]
-fn fetch_llm<'p>(
+fn fetch_model<'p>(
    py: Python<'p>,
    remote_name: &str,
    ignore_weights: bool,

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -1011,7 +1011,7 @@ class KvRouterConfig:
        """
        ...

-async def register_llm(
+async def register_model(
    model_input: ModelInput,
    model_type: ModelType,
    endpoint: Endpoint,
@@ -1040,7 +1040,7 @@ async def register_llm(
    """
    ...

-async def unregister_llm(
+async def unregister_model(
    endpoint: Endpoint,
    lora_name: Optional[str] = None,
 ) -> None:
@@ -1055,14 +1055,19 @@ def lora_name_to_id(lora_name: str) -> int:
    """Generate a deterministic integer ID from a LoRA name using blake3 hash."""
    ...

-async def fetch_llm(remote_name: str, ignore_weights: bool = False) -> str:
+async def fetch_model(remote_name: str, ignore_weights: bool = False) -> str:
    """
-    Download a model from Hugging Face, returning it's local path.
+    Download a model from Hugging Face, returning its local path.
    If `ignore_weights` is True, only fetches tokenizer and config files.
-    Example: `model_path = await fetch_llm("Qwen/Qwen3-0.6B")`
+    Example: `model_path = await fetch_model("Qwen/Qwen3-0.6B")`
    """
    ...

+# Backward-compatible aliases (deprecated, use new names)
+fetch_llm = fetch_model
+register_llm = register_model
+unregister_llm = unregister_model
+
 class EngineConfig:
    """Holds internal configuration for a Dynamo engine."""
    ...

--- a/lib/bindings/python/src/dynamo/llm/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/__init__.py
@@ -29,11 +29,16 @@ from dynamo._core import RouterMode as RouterMode
 from dynamo._core import WorkerMetricsPublisher as WorkerMetricsPublisher
 from dynamo._core import ZmqKvEventListener as ZmqKvEventListener
 from dynamo._core import compute_block_hash_for_seq as compute_block_hash_for_seq
-from dynamo._core import fetch_llm as fetch_llm
+from dynamo._core import fetch_model as fetch_model
 from dynamo._core import lora_name_to_id as lora_name_to_id
 from dynamo._core import make_engine
-from dynamo._core import register_llm as register_llm
+from dynamo._core import register_model as register_model
 from dynamo._core import run_input
-from dynamo._core import unregister_llm as unregister_llm
+from dynamo._core import unregister_model as unregister_model

 from .exceptions import HttpError
+
+# Backward-compatible aliases (deprecated, use new names)
+fetch_llm = fetch_model
+register_llm = register_model
+unregister_llm = unregister_model
--- a/lib/bindings/python/tests/test_tensor.py
+++ b/lib/bindings/python/tests/test_tensor.py
@@ -8,7 +8,7 @@ import os
 import pytest
 import uvloop

-from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
 from dynamo.runtime import DistributedRuntime

 TEST_END_TO_END = os.environ.get("TEST_END_TO_END", 0)
@@ -34,8 +34,8 @@ async def test_register(runtime: DistributedRuntime):

    assert model_config == runtime_config.get_tensor_model_config()

-    # Use register_llm for tensor-based backends (skips HuggingFace downloads)
-    await register_llm(
+    # Use register_model for tensor-based backends (skips HuggingFace downloads)
+    await register_model(
        ModelInput.Tensor,
        ModelType.TensorBased,
        endpoint,

--- a/lib/llm/src/preprocessor/media/README.md
+++ b/lib/llm/src/preprocessor/media/README.md
@@ -33,7 +33,7 @@ If `enable_image` or `enable_video` are not called, requests containing the corr
 Register the LLM as usual, adding the media configuration:

 ```python
-register_llm(
+register_model(
  ...,
  media_decoder=decoder,
  media_fetcher=fetcher,

--- a/tests/frontend/grpc/echo_tensor_worker.py
+++ b/tests/frontend/grpc/echo_tensor_worker.py
@@ -9,7 +9,7 @@
 import tritonclient.grpc.model_config_pb2 as mc
 import uvloop

-from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker


@@ -53,8 +53,8 @@ async def echo_tensor_worker(runtime: DistributedRuntime):
    )
    assert model_config == retrieved_model_config

-    # Use register_llm for tensor-based backends (skips HuggingFace downloads)
-    await register_llm(
+    # Use register_model for tensor-based backends (skips HuggingFace downloads)
+    await register_model(
        ModelInput.Tensor,
        ModelType.TensorBased,
        endpoint,

--- a/tests/serve/launch/template_verifier.py
+++ b/tests/serve/launch/template_verifier.py
@@ -9,7 +9,7 @@ import uvloop
 from transformers import AutoTokenizer

 from dynamo.common.utils.paths import WORKSPACE_DIR
-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelType, register_model
 from dynamo.runtime import DistributedRuntime, dynamo_worker

 SERVE_TEST_DIR = os.path.join(WORKSPACE_DIR, "tests/serve")
@@ -54,7 +54,7 @@ async def main(runtime: DistributedRuntime):

    # Register model with custom template
    model_name = "Qwen/Qwen3-0.6B"
-    await register_llm(
+    await register_model(
        ModelInput.Tokens,
        ModelType.Chat,
        endpoint,