Unverified commit 5624d144, authored by Tzu-Ling Kan, committed by GitHub
Browse files

Rename fetch_llm to fetch_model (#6268)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
parent 09b6ab2f
......@@ -12,7 +12,7 @@ import uvloop
from google.protobuf import text_format
from tritonclient.utils import triton_to_np_dtype
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -147,8 +147,8 @@ async def triton_worker(runtime: DistributedRuntime, args: argparse.Namespace):
runtime_config.set_tensor_model_config(model_config)
logger.info("Attempting to register model with Dynamo runtime...")
# Use register_llm for tensor-based models (skips HuggingFace downloads)
await register_llm(
# Use register_model for tensor-based models (skips HuggingFace downloads)
await register_model(
ModelInput.Tensor,
ModelType.TensorBased,
endpoint,
......
......@@ -20,7 +20,7 @@ from vllm.outputs import RequestOutput
from vllm.tokenizers import TokenizerLike as AnyTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -318,7 +318,7 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
await encode_worker_client.wait_for_instances()
# Register the endpoint as entrypoint to a model
await register_llm(
await register_model(
ModelInput.Text, # Custom processor is used and this type bypasses SDK processor
ModelType.Chat,
generate_endpoint,
......
......@@ -8,7 +8,7 @@
# request via NATS to this python script, which runs sglang.
#
# The key differences between this and `server_sglang_tok.py` are:
# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Tokens` input
# - The `register_model` function registers us a `Chat` and `Completions` model that accepts `Tokens` input
# - The `generate` function receives a pre-tokenized request and must return token_ids in the response.
#
# Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
......@@ -27,7 +27,7 @@ import sglang
import uvloop
from sglang.srt.server_args import ServerArgs
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
......@@ -91,7 +91,7 @@ async def init(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(config.namespace).component(config.component)
endpoint = component.endpoint(config.endpoint)
await register_llm(
await register_model(
ModelInput.Tokens,
ModelType.Chat | ModelType.Completions,
endpoint,
......
......@@ -9,7 +9,7 @@
# do the pre/post-processing.
#
# The key differences between this and `server_sglang.py` are:
# - The `register_llm` function registers us a `Chat` and `Completions` model that accepts `Text` input
# - The `register_model` function registers us a `Chat` and `Completions` model that accepts `Text` input
# - The `generate` function receives a chat completion request and must return matching response
#
# Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
......@@ -31,7 +31,7 @@ from sglang.srt.openai_api.adapter import v1_chat_generate_request
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.server_args import ServerArgs
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
......@@ -104,7 +104,7 @@ async def init(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(config.namespace).component(config.component)
endpoint = component.endpoint(config.endpoint)
await register_llm(
await register_model(
ModelInput.Text, ModelType.Chat | ModelType.Completions, endpoint, config.model
)
......
......@@ -27,7 +27,7 @@ from vllm.entrypoints.openai.api_server import (
)
from vllm.inputs import TokensPrompt
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
......@@ -102,7 +102,7 @@ async def init(runtime: DistributedRuntime, config: Config):
component = runtime.namespace(config.namespace).component(config.component)
endpoint = component.endpoint(config.endpoint)
await register_llm(
await register_model(
ModelInput.Tokens,
ModelType.Chat | ModelType.Completions,
endpoint,
......
......@@ -142,9 +142,9 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(llm::kv::compute_block_hash_for_seq_py, m)?)?;
m.add_function(wrap_pyfunction!(lora_name_to_id, m)?)?;
m.add_function(wrap_pyfunction!(log_message, m)?)?;
m.add_function(wrap_pyfunction!(register_llm, m)?)?;
m.add_function(wrap_pyfunction!(unregister_llm, m)?)?;
m.add_function(wrap_pyfunction!(fetch_llm, m)?)?;
m.add_function(wrap_pyfunction!(register_model, m)?)?;
m.add_function(wrap_pyfunction!(unregister_model, m)?)?;
m.add_function(wrap_pyfunction!(fetch_model, m)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?;
m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?;
......@@ -228,7 +228,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 {
#[pyfunction]
#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
#[allow(clippy::too_many_arguments)]
fn register_llm<'p>(
fn register_model<'p>(
py: Python<'p>,
model_input: ModelInput,
model_type: ModelType,
......@@ -409,7 +409,7 @@ fn register_llm<'p>(
/// - LoRA model: `v1/mdc/{namespace}/{component}/{endpoint}/{instance_id}/{lora_slug}`
#[pyfunction]
#[pyo3(signature = (endpoint, lora_name=None))]
fn unregister_llm<'p>(
fn unregister_model<'p>(
py: Python<'p>,
endpoint: Endpoint,
lora_name: Option<&str>,
......@@ -425,11 +425,11 @@ fn unregister_llm<'p>(
})
}
/// Download a model from Hugging Face, returning it's local path
/// Example: `model_path = await fetch_llm("Qwen/Qwen3-0.6B")`
/// Download a model from Hugging Face, returning its local path
/// Example: `model_path = await fetch_model("Qwen/Qwen3-0.6B")`
#[pyfunction]
#[pyo3(signature = (remote_name, ignore_weights=false))]
fn fetch_llm<'p>(
fn fetch_model<'p>(
py: Python<'p>,
remote_name: &str,
ignore_weights: bool,
......
......@@ -1011,7 +1011,7 @@ class KvRouterConfig:
"""
...
async def register_llm(
async def register_model(
model_input: ModelInput,
model_type: ModelType,
endpoint: Endpoint,
......@@ -1040,7 +1040,7 @@ async def register_llm(
"""
...
async def unregister_llm(
async def unregister_model(
endpoint: Endpoint,
lora_name: Optional[str] = None,
) -> None:
......@@ -1055,14 +1055,19 @@ def lora_name_to_id(lora_name: str) -> int:
"""Generate a deterministic integer ID from a LoRA name using blake3 hash."""
...
async def fetch_llm(remote_name: str, ignore_weights: bool = False) -> str:
async def fetch_model(remote_name: str, ignore_weights: bool = False) -> str:
"""
Download a model from Hugging Face, returning it's local path.
Download a model from Hugging Face, returning its local path.
If `ignore_weights` is True, only fetches tokenizer and config files.
Example: `model_path = await fetch_llm("Qwen/Qwen3-0.6B")`
Example: `model_path = await fetch_model("Qwen/Qwen3-0.6B")`
"""
...
# Backward-compatible aliases (deprecated, use new names)
fetch_llm = fetch_model
register_llm = register_model
unregister_llm = unregister_model
class EngineConfig:
"""Holds internal configuration for a Dynamo engine."""
...
......
......@@ -29,11 +29,16 @@ from dynamo._core import RouterMode as RouterMode
from dynamo._core import WorkerMetricsPublisher as WorkerMetricsPublisher
from dynamo._core import ZmqKvEventListener as ZmqKvEventListener
from dynamo._core import compute_block_hash_for_seq as compute_block_hash_for_seq
from dynamo._core import fetch_llm as fetch_llm
from dynamo._core import fetch_model as fetch_model
from dynamo._core import lora_name_to_id as lora_name_to_id
from dynamo._core import make_engine
from dynamo._core import register_llm as register_llm
from dynamo._core import register_model as register_model
from dynamo._core import run_input
from dynamo._core import unregister_llm as unregister_llm
from dynamo._core import unregister_model as unregister_model
from .exceptions import HttpError
# Backward-compatible aliases (deprecated, use the *_model names)
fetch_llm = fetch_model
register_llm = register_model
unregister_llm = unregister_model
......@@ -8,7 +8,7 @@ import os
import pytest
import uvloop
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.runtime import DistributedRuntime
TEST_END_TO_END = os.environ.get("TEST_END_TO_END", 0)
......@@ -34,8 +34,8 @@ async def test_register(runtime: DistributedRuntime):
assert model_config == runtime_config.get_tensor_model_config()
# Use register_llm for tensor-based backends (skips HuggingFace downloads)
await register_llm(
# Use register_model for tensor-based backends (skips HuggingFace downloads)
await register_model(
ModelInput.Tensor,
ModelType.TensorBased,
endpoint,
......
......@@ -33,7 +33,7 @@ If `enable_image` or `enable_video` are not called, requests containing the corr
Register the LLM as usual, adding the media configuration:
```python
register_llm(
register_model(
...,
media_decoder=decoder,
media_fetcher=fetcher,
......
......@@ -9,7 +9,7 @@
import tritonclient.grpc.model_config_pb2 as mc
import uvloop
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
......@@ -53,8 +53,8 @@ async def echo_tensor_worker(runtime: DistributedRuntime):
)
assert model_config == retrieved_model_config
# Use register_llm for tensor-based backends (skips HuggingFace downloads)
await register_llm(
# Use register_model for tensor-based backends (skips HuggingFace downloads)
await register_model(
ModelInput.Tensor,
ModelType.TensorBased,
endpoint,
......
......@@ -9,7 +9,7 @@ import uvloop
from transformers import AutoTokenizer
from dynamo.common.utils.paths import WORKSPACE_DIR
from dynamo.llm import ModelInput, ModelType, register_llm
from dynamo.llm import ModelInput, ModelType, register_model
from dynamo.runtime import DistributedRuntime, dynamo_worker
SERVE_TEST_DIR = os.path.join(WORKSPACE_DIR, "tests/serve")
......@@ -54,7 +54,7 @@ async def main(runtime: DistributedRuntime):
# Register model with custom template
model_name = "Qwen/Qwen3-0.6B"
await register_llm(
await register_model(
ModelInput.Tokens,
ModelType.Chat,
endpoint,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment