Unverified Commit 99cd9d85 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: dynamo-run <-> python interop (#934)

Adding this to a Python script makes it register on the network so that `dynamo-run` can discover it and send it requests:
```
from dynamo.llm import register_llm

MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
await register_llm(endpoint, MODEL, 3)
```

Full vllm example, with pre-processing in dynamo:
- `dynamo-run in=text out=dyn://dynamo.backend.generate`
- `cd lib/bindings/python/examples/hello_world`
- `python server_vllm.py`

This builds on top of the work to move pre-processor to ingress side. It means we can decouple Rust and Python using NATS as the bus.

The `register_llm` call does this:

- Download the model from HF if necessary
- Load the model deployment card from the HF folder or extract from GGUF
- Push the tokenizer config etc into NATS object store so ingress can access it from a different machine
- Publish the model deployment card to ETCD
parent 829e1cf5
...@@ -1678,7 +1678,6 @@ dependencies = [ ...@@ -1678,7 +1678,6 @@ dependencies = [
"dynamo-runtime", "dynamo-runtime",
"futures", "futures",
"futures-util", "futures-util",
"hf-hub",
"humantime", "humantime",
"netlink-packet-route", "netlink-packet-route",
"rtnetlink", "rtnetlink",
......
...@@ -53,7 +53,6 @@ anyhow = { workspace = true } ...@@ -53,7 +53,6 @@ anyhow = { workspace = true }
async-stream = { workspace = true } async-stream = { workspace = true }
async-trait = { workspace = true } async-trait = { workspace = true }
futures = { workspace = true } futures = { workspace = true }
hf-hub = { workspace = true }
humantime = { workspace = true } humantime = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
......
...@@ -66,7 +66,7 @@ struct Entry { ...@@ -66,7 +66,7 @@ struct Entry {
pub async fn run( pub async fn run(
runtime: Runtime, runtime: Runtime,
flags: Flags, flags: Flags,
maybe_card: Option<ModelDeploymentCard>, card: ModelDeploymentCard,
input_jsonl: PathBuf, input_jsonl: PathBuf,
engine_config: EngineConfig, engine_config: EngineConfig,
template: Option<RequestTemplate>, template: Option<RequestTemplate>,
...@@ -83,7 +83,7 @@ pub async fn run( ...@@ -83,7 +83,7 @@ pub async fn run(
let prepared_engine = common::prepare_engine(runtime, flags, engine_config).await?; let prepared_engine = common::prepare_engine(runtime, flags, engine_config).await?;
let service_name_ref = Arc::new(prepared_engine.service_name); let service_name_ref = Arc::new(prepared_engine.service_name);
let pre_processor = if let Some(card) = maybe_card { let pre_processor = if card.has_tokenizer() {
Some(OpenAIPreprocessor::new(card).await?) Some(OpenAIPreprocessor::new(card).await?)
} else { } else {
None None
......
...@@ -153,11 +153,8 @@ pub async fn prepare_engine( ...@@ -153,11 +153,8 @@ pub async fn prepare_engine(
_cache_dir: cache_dir, _cache_dir: cache_dir,
}) })
} }
EngineConfig::StaticFull { EngineConfig::StaticFull { engine, model } => {
service_name, let service_name = model.service_name().to_string();
engine,
card: _card,
} => {
tracing::debug!("Model: {service_name} with engine pre-processing"); tracing::debug!("Model: {service_name} with engine pre-processing");
let engine = Arc::new(StreamingEngineAdapter::new(engine)); let engine = Arc::new(StreamingEngineAdapter::new(engine));
Ok(PreparedEngine { Ok(PreparedEngine {
...@@ -168,16 +165,16 @@ pub async fn prepare_engine( ...@@ -168,16 +165,16 @@ pub async fn prepare_engine(
}) })
} }
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name,
engine: inner_engine, engine: inner_engine,
card, model,
} => { } => {
let pipeline = build_pipeline::< let pipeline = build_pipeline::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
NvCreateChatCompletionStreamResponse, NvCreateChatCompletionStreamResponse,
>(&card, inner_engine) >(model.card(), inner_engine)
.await?; .await?;
let service_name = model.service_name().to_string();
tracing::debug!("Model: {service_name} with Dynamo pre-processing"); tracing::debug!("Model: {service_name} with Dynamo pre-processing");
Ok(PreparedEngine { Ok(PreparedEngine {
service_name, service_name,
...@@ -236,7 +233,7 @@ mod tests { ...@@ -236,7 +233,7 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_build_chat_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> { async fn test_build_chat_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
// Create test model card // Create test model card
let card = ModelDeploymentCard::from_local_path(HF_PATH, None).await?; let card = ModelDeploymentCard::load(HF_PATH).await?;
let engine = dynamo_llm::engines::make_engine_core(); let engine = dynamo_llm::engines::make_engine_core();
// Build pipeline for chat completions // Build pipeline for chat completions
...@@ -255,7 +252,7 @@ mod tests { ...@@ -255,7 +252,7 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_build_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> { async fn test_build_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
// Create test model card // Create test model card
let card = ModelDeploymentCard::from_local_path(HF_PATH, None).await?; let card = ModelDeploymentCard::load(HF_PATH).await?;
let engine = dynamo_llm::engines::make_engine_core(); let engine = dynamo_llm::engines::make_engine_core();
// Build pipeline for completions // Build pipeline for completions
......
...@@ -18,9 +18,6 @@ use std::{pin::Pin, sync::Arc}; ...@@ -18,9 +18,6 @@ use std::{pin::Pin, sync::Arc};
use dynamo_llm::{ use dynamo_llm::{
backend::Backend, backend::Backend,
engines::StreamingEngineAdapter, engines::StreamingEngineAdapter,
http::service::discovery::{ModelEntry, ModelNetworkName},
key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager},
model_card::{self, ModelDeploymentCard},
model_type::ModelType, model_type::ModelType,
preprocessor::{BackendInput, BackendOutput}, preprocessor::{BackendInput, BackendOutput},
types::{ types::{
...@@ -30,10 +27,10 @@ use dynamo_llm::{ ...@@ -30,10 +27,10 @@ use dynamo_llm::{
Annotated, Annotated,
}, },
}; };
use dynamo_runtime::engine::AsyncEngineStream;
use dynamo_runtime::pipeline::{ use dynamo_runtime::pipeline::{
network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source, network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
}; };
use dynamo_runtime::{component::Endpoint, engine::AsyncEngineStream};
use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime};
use crate::EngineConfig; use crate::EngineConfig;
...@@ -46,62 +43,50 @@ pub async fn run( ...@@ -46,62 +43,50 @@ pub async fn run(
let cancel_token = distributed_runtime.primary_token().clone(); let cancel_token = distributed_runtime.primary_token().clone();
let endpoint_id: EndpointId = path.parse()?; let endpoint_id: EndpointId = path.parse()?;
let component = distributed_runtime
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)?;
let endpoint = component
.service_builder()
.create()
.await?
.endpoint(&endpoint_id.name);
let (rt_fut, mut card) = match engine_config { let (rt_fut, mut card) = match engine_config {
EngineConfig::StaticFull { EngineConfig::StaticFull { engine, mut model } => {
service_name,
engine,
mut card,
} => {
let engine = Arc::new(StreamingEngineAdapter::new(engine)); let engine = Arc::new(StreamingEngineAdapter::new(engine));
card.requires_preprocessing = false;
let ingress_chat = Ingress::< let ingress_chat = Ingress::<
Context<NvCreateChatCompletionRequest>, Context<NvCreateChatCompletionRequest>,
Pin<Box<dyn AsyncEngineStream<Annotated<NvCreateChatCompletionStreamResponse>>>>, Pin<Box<dyn AsyncEngineStream<Annotated<NvCreateChatCompletionStreamResponse>>>>,
>::for_engine(engine)?; >::for_engine(engine)?;
let endpoint_chat = register(
distributed_runtime.clone(),
&service_name,
endpoint_id,
*card.clone(),
ModelType::Chat,
)
.await?;
let fut_chat = endpoint_chat
.endpoint_builder()
.handler(ingress_chat)
.start();
(fut_chat, card) model.attach(&endpoint, ModelType::Chat).await?;
let fut_chat = endpoint.endpoint_builder().handler(ingress_chat).start();
(fut_chat, model.card().clone())
} }
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name,
engine: inner_engine, engine: inner_engine,
mut card, mut model,
} => { } => {
// Pre-processing is done ingress-side, so it should be already done. // Pre-processing is done ingress-side, so it should be already done.
let frontend = let frontend =
SegmentSource::<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>>::new(); SegmentSource::<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>>::new();
let backend = Backend::from_mdc(*card.clone()).await?.into_operator(); let backend = Backend::from_mdc(model.card().clone())
.await?
.into_operator();
let engine = ServiceBackend::from_engine(inner_engine); let engine = ServiceBackend::from_engine(inner_engine);
let pipeline = frontend let pipeline = frontend
.link(backend.forward_edge())? .link(backend.forward_edge())?
.link(engine)? .link(engine)?
.link(backend.backward_edge())? .link(backend.backward_edge())?
.link(frontend)?; .link(frontend)?;
let ingress = Ingress::for_pipeline(pipeline)?; let ingress = Ingress::for_pipeline(pipeline)?;
card.requires_preprocessing = true;
let endpoint = register( model.attach(&endpoint, ModelType::Backend).await?;
distributed_runtime.clone(), let fut = endpoint.endpoint_builder().handler(ingress).start();
&service_name,
endpoint_id, (fut, model.card().clone())
*card.clone(),
ModelType::Backend,
)
.await?;
(endpoint.endpoint_builder().handler(ingress).start(), card)
} }
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
...@@ -127,54 +112,3 @@ pub async fn run( ...@@ -127,54 +112,3 @@ pub async fn run(
Ok(()) Ok(())
} }
async fn register(
distributed_runtime: DistributedRuntime,
service_name: &str,
endpoint_id: EndpointId,
mut card: ModelDeploymentCard,
model_type: ModelType,
) -> anyhow::Result<Endpoint> {
let component = distributed_runtime
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)?;
let endpoint = component
.service_builder()
.create()
.await?
.endpoint(&endpoint_id.name);
// A static component doesn't have an etcd_client because it doesn't need to register
if let Some(etcd_client) = distributed_runtime.etcd_client() {
// Store model config files in NATS object store
let nats_client = distributed_runtime.nats_client();
card.move_to_nats(nats_client.clone()).await?;
// Publish the Model Deployment Card to etcd
let kvstore: Box<dyn KeyValueStore> =
Box::new(EtcdStorage::new(etcd_client.clone(), endpoint_id.clone()));
let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
let key = card.slug().to_string();
card_store
.publish(model_card::BUCKET_NAME, None, &key, &mut card)
.await?;
// Publish our ModelEntry to etcd. This allows ingress to find the model card.
// (Why don't we put the model card directly under this key?)
let network_name = ModelNetworkName::from_local(&endpoint, etcd_client.lease_id());
tracing::debug!("Registering with etcd as {network_name}");
let model_registration = ModelEntry {
name: service_name.to_string(),
endpoint: endpoint_id.clone(),
model_type,
};
etcd_client
.kv_create(
network_name.to_string(),
serde_json::to_vec_pretty(&model_registration)?,
None, // use primary lease
)
.await?;
}
Ok(endpoint)
}
...@@ -71,36 +71,31 @@ pub async fn run( ...@@ -71,36 +71,31 @@ pub async fn run(
} }
} }
} }
EngineConfig::StaticFull { EngineConfig::StaticFull { engine, model } => {
service_name,
engine,
..
} => {
let engine = Arc::new(StreamingEngineAdapter::new(engine)); let engine = Arc::new(StreamingEngineAdapter::new(engine));
let manager = http_service.model_manager(); let manager = http_service.model_manager();
manager.add_completions_model(&service_name, engine.clone())?; manager.add_completions_model(model.service_name(), engine.clone())?;
manager.add_chat_completions_model(&service_name, engine)?; manager.add_chat_completions_model(model.service_name(), engine)?;
} }
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name,
engine: inner_engine, engine: inner_engine,
card, model,
} => { } => {
let manager = http_service.model_manager(); let manager = http_service.model_manager();
let chat_pipeline = common::build_pipeline::< let chat_pipeline = common::build_pipeline::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
NvCreateChatCompletionStreamResponse, NvCreateChatCompletionStreamResponse,
>(&card, inner_engine.clone()) >(model.card(), inner_engine.clone())
.await?; .await?;
manager.add_chat_completions_model(&service_name, chat_pipeline)?; manager.add_chat_completions_model(model.service_name(), chat_pipeline)?;
let cmpl_pipeline = common::build_pipeline::<CompletionRequest, CompletionResponse>( let cmpl_pipeline = common::build_pipeline::<CompletionRequest, CompletionResponse>(
&card, model.card(),
inner_engine, inner_engine,
) )
.await?; .await?;
manager.add_completions_model(&service_name, cmpl_pipeline)?; manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
} }
EngineConfig::None => unreachable!(), EngineConfig::None => unreachable!(),
} }
......
...@@ -15,17 +15,17 @@ ...@@ -15,17 +15,17 @@
#[cfg(any(feature = "vllm", feature = "sglang"))] #[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use std::{io::Read, path::PathBuf, sync::Arc}; use std::{io::Read, sync::Arc};
use anyhow::Context;
use dynamo_llm::{ use dynamo_llm::{
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher, backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
model_card::model::ModelDeploymentCard, LocalModel,
}; };
use dynamo_runtime::{protocols::Endpoint, DistributedRuntime}; use dynamo_runtime::{protocols::Endpoint, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
mod hub;
mod input; mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))] #[cfg(any(feature = "vllm", feature = "sglang"))]
mod net; mod net;
...@@ -53,25 +53,20 @@ const PYTHON_STR_SCHEME: &str = "pystr:"; ...@@ -53,25 +53,20 @@ const PYTHON_STR_SCHEME: &str = "pystr:";
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_TOK_SCHEME: &str = "pytok:"; const PYTHON_TOK_SCHEME: &str = "pytok:";
/// Prefix for Hugging Face model repository
const HF_SCHEME: &str = "hf://";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
/// A Full service engine does it's own tokenization and prompt formatting. /// A Full service engine does it's own tokenization and prompt formatting.
StaticFull { StaticFull {
service_name: String,
engine: Arc<dyn StreamingEngine>, engine: Arc<dyn StreamingEngine>,
card: Box<ModelDeploymentCard>, model: Box<LocalModel>,
}, },
/// A core engine expects to be wrapped with pre/post processors that handle tokenization. /// A core engine expects to be wrapped with pre/post processors that handle tokenization.
StaticCore { StaticCore {
service_name: String,
engine: ExecutionContext, engine: ExecutionContext,
card: Box<ModelDeploymentCard>, model: Box<LocalModel>,
}, },
/// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work. /// vllm multi-node doesn't run an engine on nodes other than 0. 'ray' does all the work.
...@@ -93,104 +88,41 @@ pub async fn run( ...@@ -93,104 +88,41 @@ pub async fn run(
#[allow(unused_variables)] zmq_socket_prefix: Option<String>, #[allow(unused_variables)] zmq_socket_prefix: Option<String>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags
// Turn relative paths into absolute paths and canonicalize them
let mut model_path = flags
.model_path_pos .model_path_pos
.clone() .clone()
.or(flags.model_path_flag.clone()) .or(flags.model_path_flag.clone());
.and_then(|p| {
// Check for hf:// prefix first
if let Some(hf_path) = p.to_string_lossy().strip_prefix(HF_SCHEME) {
return Some(PathBuf::from(hf_path));
}
if p.exists() {
p.canonicalize().ok()
} else {
Some(p)
}
});
// Serve the model under the name provided, or the name of the GGUF file or HF repo. let local_model: LocalModel = match out_opt {
let mut model_name = flags // If output is an endpoint we are ingress and don't have a local model, but making an
.model_name // empty one cleans up the code.
.clone() Output::Endpoint(_) => Default::default(),
.or_else(|| { _ => {
model_path match &maybe_path {
.as_ref() Some(model_path) => {
.and_then(|p| p.iter().next_back()) let maybe_model_name = if in_opt == Input::Text {
.map(|n| n.to_string_lossy().into_owned())
})
.or_else(|| {
if in_opt == Input::Text {
Some(INVISIBLE_MODEL_NAME.to_string()) Some(INVISIBLE_MODEL_NAME.to_string())
} else { } else {
None flags.model_name.clone()
} };
}); LocalModel::prepare(
model_path.to_str().context("Invalid UTF-8 in model path")?,
// If it's an HF repo download it flags.model_config.as_deref(),
if let Some(inner_model_path) = model_path.as_ref() { maybe_model_name.as_deref(),
if !inner_model_path.exists() && !inner_model_path.is_absolute() { )
model_name = Some(inner_model_path.display().to_string()); .await?
model_path = Some(hub::from_hf(inner_model_path).await?);
}
}
// Load the model deployment card, if any
// Only used by some engines, so without those feature flags it's unused.
#[allow(unused_variables)]
let maybe_card = match (&model_path, &flags.model_config) {
// --model-config takes precedence
(_, Some(model_config)) => {
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref()).await {
Ok(card) => Some(card),
Err(e) => {
tracing::error!(
"Failed to load model card from --model-config path {}: {e}",
model_config.display(),
);
None
}
}
}
// If --model-path is an HF repo use that
(Some(model_path), _) if model_path.is_dir() => {
match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref()).await {
Ok(card) => Some(card),
Err(e) => {
tracing::error!(
"Failed to load model card from --model-path {}: {e}",
model_path.display(),
);
None
}
}
}
(Some(model_path), _) if model_path.is_file() => {
match ModelDeploymentCard::from_gguf(model_path, model_name.as_deref()).await {
Ok(card) => Some(card),
Err(e) => {
tracing::error!(
"Failed to load model card from GGUF {}: {e}",
model_path.display(),
);
None
} }
None => {
// echo_full engine doesn't need a path
Default::default()
} }
} }
// Otherwise we don't have one, but we only need it if we're tokenizing
_ => {
tracing::debug!(
"No model card path provided (neither --model-config nor --model-path)"
);
None
} }
}; };
let dyn_input = match &in_opt { let dyn_input = match &in_opt {
Input::Endpoint(endpoint_path) => { Input::Endpoint(endpoint_path) => {
if model_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false) if maybe_path.as_ref().map(|mp| mp.is_file()).unwrap_or(false)
&& flags.model_config.is_none() && flags.model_config.is_none()
{ {
// TODO We need to convert tokenizer extract from GGUF file into something we can // TODO We need to convert tokenizer extract from GGUF file into something we can
...@@ -222,60 +154,41 @@ pub async fn run( ...@@ -222,60 +154,41 @@ pub async fn run(
None None
}; };
// We may need it later
let card = local_model.card().clone();
// Create the engine matching `out` // Create the engine matching `out`
let engine_config = match out_opt { let engine_config = match out_opt {
Output::EchoFull => { Output::Endpoint(path) => {
let Some(model_name) = model_name else { let endpoint: Endpoint = path.parse()?;
anyhow::bail!( EngineConfig::Dynamic(endpoint)
"Pass --model-name or --model-path so we know which model to imitate"
);
};
EngineConfig::StaticFull {
card: Box::new(ModelDeploymentCard::with_name_only(&model_name)),
service_name: model_name,
engine: dynamo_llm::engines::make_engine_full(),
}
} }
Output::EchoFull => EngineConfig::StaticFull {
model: Box::new(local_model),
engine: dynamo_llm::engines::make_engine_full(),
},
Output::EchoCore => { Output::EchoCore => {
let Some(mut card) = maybe_card.clone() else { let card = local_model.card();
if !card.has_tokenizer() {
anyhow::bail!( anyhow::bail!(
"out=echo_core need to find the tokenizer. Pass flag --model-path <path>" "out=echo_core need to find the tokenizer. Pass flag --model-path <path>"
); );
}; };
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine: dynamo_llm::engines::make_engine_core(), engine: dynamo_llm::engines::make_engine_core(),
card: Box::new(card), model: Box::new(local_model),
} }
} }
Output::Endpoint(path) => {
let endpoint: Endpoint = path.parse()?;
EngineConfig::Dynamic(endpoint)
}
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
Output::MistralRs => { Output::MistralRs => EngineConfig::StaticFull {
let Some(model_path) = model_path else { engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
anyhow::bail!("out=mistralrs requires flag --model-path=<full-path-to-model-gguf>"); model: Box::new(local_model),
}; },
let Some(model_name) = model_name else {
unreachable!("We checked model_path earlier, and set model_name from model_path");
};
EngineConfig::StaticFull {
card: Box::new(ModelDeploymentCard::with_name_only(&model_name)),
service_name: model_name,
engine: dynamo_engine_mistralrs::make_engine(&model_path).await?,
}
}
#[cfg(feature = "sglang")] #[cfg(feature = "sglang")]
Output::SgLang => { Output::SgLang => {
let Some(model_path) = model_path else { if !local_model.path().is_dir() {
anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
};
if !model_path.is_dir() {
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
// Safety: Earlier we build maybe_card from model_path, which we checked right above
let card = maybe_card.clone().unwrap();
let Some(sock_prefix) = zmq_socket_prefix else { let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix"); anyhow::bail!("sglang requires zmq_socket_prefix");
}; };
...@@ -299,7 +212,7 @@ pub async fn run( ...@@ -299,7 +212,7 @@ pub async fn run(
let (engine, sglang_process) = dynamo_engine_sglang::make_engine( let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(), cancel_token.clone(),
&model_path, local_model.path(),
&sock_prefix, &sock_prefix,
node_conf, node_conf,
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -311,9 +224,8 @@ pub async fn run( ...@@ -311,9 +224,8 @@ pub async fn run(
let _ = sglang_process.await; let _ = sglang_process.await;
})); }));
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine, engine,
card: Box::new(card), model: Box::new(local_model),
} }
} }
#[cfg(feature = "vllm")] #[cfg(feature = "vllm")]
...@@ -321,16 +233,6 @@ pub async fn run( ...@@ -321,16 +233,6 @@ pub async fn run(
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let Some(model_path) = model_path else {
anyhow::bail!(
"out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
);
};
let Some(card) = maybe_card.clone() else {
anyhow::bail!(
"Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
);
};
let Some(sock_prefix) = zmq_socket_prefix else { let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix"); anyhow::bail!("vllm requires zmq_socket_prefix");
}; };
...@@ -368,7 +270,7 @@ pub async fn run( ...@@ -368,7 +270,7 @@ pub async fn run(
// vllm multi-node only the leader runs vllm // vllm multi-node only the leader runs vllm
let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine( let (engine, vllm_future) = dynamo_engine_vllm0_7::make_leader_engine(
cancel_token.clone(), cancel_token.clone(),
&model_path, local_model.path(),
&sock_prefix, &sock_prefix,
node_conf, node_conf,
flags.tensor_parallel_size, flags.tensor_parallel_size,
...@@ -380,9 +282,8 @@ pub async fn run( ...@@ -380,9 +282,8 @@ pub async fn run(
let _ = vllm_future.await; let _ = vllm_future.await;
})); }));
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine, engine,
card: Box::new(card), model: Box::new(local_model),
} }
} else { } else {
// Nodes rank > 0 only run 'ray' // Nodes rank > 0 only run 'ray'
...@@ -398,16 +299,6 @@ pub async fn run( ...@@ -398,16 +299,6 @@ pub async fn run(
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
let Some(model_path) = model_path else {
anyhow::bail!(
"out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
);
};
let Some(card) = maybe_card.clone() else {
anyhow::bail!(
"Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
);
};
let node_conf = dynamo_llm::engines::MultiNodeConfig { let node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
...@@ -415,71 +306,53 @@ pub async fn run( ...@@ -415,71 +306,53 @@ pub async fn run(
}; };
let engine = dynamo_engine_vllm0_8::make_engine( let engine = dynamo_engine_vllm0_8::make_engine(
cancel_token.clone(), cancel_token.clone(),
&model_path, local_model.path(),
node_conf, node_conf,
flags.tensor_parallel_size, flags.tensor_parallel_size,
flags.extra_engine_args.clone(), flags.extra_engine_args.clone(),
) )
.await?; .await?;
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine, engine,
card: Box::new(card), model: Box::new(local_model),
} }
} }
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => { Output::LlamaCpp => {
let Some(model_path) = model_path else { if !local_model.path().is_file() {
anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
};
if !model_path.is_file() {
anyhow::bail!("--model-path should refer to a GGUF file. llama_cpp does not support safetensors."); anyhow::bail!("--model-path should refer to a GGUF file. llama_cpp does not support safetensors.");
} }
let Some(card) = maybe_card.clone() else {
anyhow::bail!(
"Pass --model-config so we can find the tokenizer, should be an HF checkout."
);
};
let engine = let engine =
dynamo_engine_llamacpp::make_engine(cancel_token.clone(), &model_path).await?; dynamo_engine_llamacpp::make_engine(cancel_token.clone(), local_model.path())
.await?;
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine, engine,
card: Box::new(card), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(path_str) => { Output::PythonStr(path_str) => {
let Some(model_name) = &model_name else { let card = local_model.card();
anyhow::bail!("Provide model service name as `--model-name <this>`"); let py_args = flags.as_vec(&path_str, &card.service_name);
};
let py_args = flags.as_vec(&path_str, model_name);
let p = std::path::PathBuf::from(path_str); let p = std::path::PathBuf::from(path_str);
let engine = let engine =
dynamo_engine_python::make_string_engine(cancel_token.clone(), &p, py_args).await?; dynamo_engine_python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticFull { EngineConfig::StaticFull {
service_name: model_name.to_string(),
engine, engine,
card: Box::new(ModelDeploymentCard::with_name_only(model_name)), model: Box::new(local_model),
} }
} }
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonTok(path_str) => { Output::PythonTok(path_str) => {
let Some(card) = maybe_card.clone() else { let card = local_model.card();
anyhow::bail!("Could not find tokenizer. Pass flag --model-path <path>"); let py_args = flags.as_vec(&path_str, &card.service_name);
};
let Some(model_name) = model_name else {
unreachable!("If we have a card we must have a model name");
};
let py_args = flags.as_vec(&path_str, &model_name);
let p = std::path::PathBuf::from(path_str); let p = std::path::PathBuf::from(path_str);
let engine = let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?; dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore { EngineConfig::StaticCore {
service_name: model_name.clone(),
engine, engine,
card: Box::new(card), model: Box::new(local_model),
} }
} }
}; };
...@@ -504,14 +377,7 @@ pub async fn run( ...@@ -504,14 +377,7 @@ pub async fn run(
.await?; .await?;
} }
Input::Batch(path) => { Input::Batch(path) => {
crate::input::batch::run( crate::input::batch::run(runtime.clone(), flags, card, path, engine_config, template)
runtime.clone(),
flags,
maybe_card,
path,
engine_config,
template,
)
.await?; .await?;
} }
Input::Endpoint(path) => { Input::Endpoint(path) => {
......
...@@ -1053,6 +1053,7 @@ dependencies = [ ...@@ -1053,6 +1053,7 @@ dependencies = [
"futures", "futures",
"galil-seiferas", "galil-seiferas",
"ggus", "ggus",
"hf-hub",
"itertools 0.14.0", "itertools 0.14.0",
"memmap2", "memmap2",
"minijinja", "minijinja",
...@@ -1195,6 +1196,15 @@ version = "1.0.0" ...@@ -1195,6 +1196,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if 1.0.0",
]
[[package]] [[package]]
name = "enum-as-inner" name = "enum-as-inner"
version = "0.6.1" version = "0.6.1"
...@@ -1870,15 +1880,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -1870,15 +1880,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1" checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1"
dependencies = [ dependencies = [
"dirs", "dirs",
"futures",
"http", "http",
"indicatif", "indicatif",
"libc", "libc",
"log", "log",
"num_cpus",
"rand 0.8.5", "rand 0.8.5",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
"thiserror 2.0.12", "thiserror 2.0.12",
"tokio",
"ureq", "ureq",
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
...@@ -3580,8 +3593,10 @@ checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb" ...@@ -3580,8 +3593,10 @@ checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb"
dependencies = [ dependencies = [
"base64 0.22.1", "base64 0.22.1",
"bytes", "bytes",
"encoding_rs",
"futures-core", "futures-core",
"futures-util", "futures-util",
"h2",
"http", "http",
"http-body", "http-body",
"http-body-util", "http-body-util",
...@@ -3605,6 +3620,7 @@ dependencies = [ ...@@ -3605,6 +3620,7 @@ dependencies = [
"serde_json", "serde_json",
"serde_urlencoded", "serde_urlencoded",
"sync_wrapper", "sync_wrapper",
"system-configuration",
"tokio", "tokio",
"tokio-rustls", "tokio-rustls",
"tokio-util", "tokio-util",
...@@ -4180,6 +4196,27 @@ dependencies = [ ...@@ -4180,6 +4196,27 @@ dependencies = [
"walkdir", "walkdir",
] ]
[[package]]
name = "system-configuration"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags 2.9.0",
"core-foundation 0.9.4",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]] [[package]]
name = "system-deps" name = "system-deps"
version = "6.2.2" version = "6.2.2"
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A very basic example of vllm worker handling pre-processed requests.
#
# Dynamo does the HTTP handling, prompt templating and tokenization, then forwards the
# request via NATS to this python script, which runs vllm.
#
# Setup a virtualenv with dynamo.llm, dynamo.runtime and vllm installed
# in lib/bindings/python `maturin develop` and `pip install -e .` should do it
# Start nats and etcd:
# - nats-server -js
# - etcd
#
# Window 1: `python server_vllm.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn://dynamo.backend.generate`
import argparse
import asyncio
import sys
import uvloop
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TokensPrompt
from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
class Config:
    """Command line parameters or defaults"""

    # Dynamo namespace this worker registers under (e.g. "dynamo")
    namespace: str
    # Component name within the namespace (e.g. "backend")
    component: str
    # Endpoint name on the component (e.g. "generate")
    endpoint: str
    # Local model path or HuggingFace model identifier
    model: str
class RequestHandler:
    """
    Request handler for the generate endpoint.

    Receives pre-processed (already tokenized) requests and streams back
    incremental token ids produced by vllm.
    """

    def __init__(self, engine):
        # vllm async engine client used to run generation
        self.engine_client = engine

    async def generate(self, request):
        # A fixed request id is sufficient for this single-request example.
        request_id = "1"  # hello_world example only

        # print(f"Received request: {request}")

        # Tokenization already happened upstream; feed token ids directly.
        prompt = TokensPrompt(prompt_token_ids=request["token_ids"])

        sampling_params = SamplingParams(
            temperature=request["sampling_options"]["temperature"],
            # vllm defaults this to 16
            max_tokens=request["stop_conditions"]["max_tokens"],
        )

        # Tokens already emitted: each yield carries only the new delta.
        num_output_tokens_so_far = 0
        gen = self.engine_client.generate(prompt, sampling_params, request_id)
        async for res in gen:
            # res is vllm's RequestOutput

            # This is the expected way for a request to end.
            # The new token ID will be eos, don't forward it.
            if res.finished:
                yield {"finish_reason": "stop", "token_ids": []}
                break

            # No outputs on an unfinished result is unexpected: report an error.
            if not res.outputs:
                yield {"finish_reason": "error", "token_ids": []}
                break

            output = res.outputs[0]
            next_total_toks = len(output.token_ids)
            # Forward only the token ids generated since the previous yield.
            out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
            if output.finish_reason:
                out["finish_reason"] = output.finish_reason
            if output.stop_reason:
                out["stop_reason"] = output.stop_reason
            yield out
            num_output_tokens_so_far = next_total_toks
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
    # Entry point: parse command line arguments, then set up and serve.
    await init(runtime, cmd_line_args())
async def init(runtime: DistributedRuntime, config: Config):
    """
    Instantiate the vllm engine and serve the generate endpoint.

    Registers the model on the network so ingress (`dynamo-run`) can
    discover it, then blocks serving requests until the lease is revoked.
    """
    component = runtime.namespace(config.namespace).component(config.component)
    await component.create_service()

    endpoint = component.endpoint(config.endpoint)
    print("Started server instance")

    # Advertise ourselves so ingress can find us.
    # ModelType.Backend: we expect pre-processed (tokenized) requests.
    await register_llm(endpoint, config.model, ModelType.Backend)

    engine_args = AsyncEngineArgs(
        model=config.model,
        task="generate",
        # dynamo does the tokenization; vllm only ever sees token ids
        skip_tokenizer_init=True,
    )

    # Use `async with` so the engine client is always torn down cleanly.
    # (Previously __aenter__ was called without a matching __aexit__,
    # leaking the engine client on shutdown or error.)
    async with build_async_engine_client_from_engine_args(engine_args) as engine_client:
        # the server will gracefully shutdown (i.e., let open TCP streams
        # finish) after the lease is revoked
        await endpoint.serve_endpoint(RequestHandler(engine_client).generate, None)
def cmd_line_args(argv=None):
    """
    Parse command line arguments into a Config.

    Args:
        argv: Optional list of argument strings (for testing); defaults to
            sys.argv[1:] when None, matching argparse's normal behavior.

    Returns:
        A populated Config.

    Exits the process with status 1 if --endpoint is not in
    'dyn://namespace.component.endpoint' (or bare 'namespace.component.endpoint') form.
    """
    parser = argparse.ArgumentParser(
        description="vLLM server integrated with Dynamo runtime."
    )
    parser.add_argument(
        "--endpoint",
        type=str,
        default=DEFAULT_ENDPOINT,
        help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
    )
    args = parser.parse_args(argv)

    config = Config()
    config.model = args.model

    # Accept both the "dyn://ns.comp.ep" and bare "ns.comp.ep" forms.
    endpoint_str = args.endpoint
    if endpoint_str.startswith("dyn://"):
        endpoint_str = endpoint_str[len("dyn://") :]
    endpoint_parts = endpoint_str.split(".")
    if len(endpoint_parts) != 3:
        print(
            f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        sys.exit(1)

    config.namespace, config.component, config.endpoint = endpoint_parts
    return config
if __name__ == "__main__":
    # Replace the default asyncio event loop with uvloop before running.
    uvloop.install()
    asyncio.run(worker())
...@@ -51,6 +51,7 @@ const DEFAULT_ANNOTATED_SETTING: Option<bool> = Some(true); ...@@ -51,6 +51,7 @@ const DEFAULT_ANNOTATED_SETTING: Option<bool> = Some(true);
fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
logging::init(); logging::init();
m.add_function(wrap_pyfunction!(log_message, m)?)?; m.add_function(wrap_pyfunction!(log_message, m)?)?;
m.add_function(wrap_pyfunction!(register_llm, m)?)?;
m.add_class::<DistributedRuntime>()?; m.add_class::<DistributedRuntime>()?;
m.add_class::<CancellationToken>()?; m.add_class::<CancellationToken>()?;
...@@ -77,6 +78,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { ...@@ -77,6 +78,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<http::HttpError>()?; m.add_class::<http::HttpError>()?;
m.add_class::<http::HttpAsyncEngine>()?; m.add_class::<http::HttpAsyncEngine>()?;
m.add_class::<EtcdKvCache>()?; m.add_class::<EtcdKvCache>()?;
m.add_class::<ModelType>()?;
engine::add_to_module(m)?; engine::add_to_module(m)?;
...@@ -97,6 +99,37 @@ fn log_message(level: &str, message: &str, module: &str, file: &str, line: u32) ...@@ -97,6 +99,37 @@ fn log_message(level: &str, message: &str, module: &str, file: &str, line: u32)
logging::log_message(level, message, module, file, line); logging::log_message(level, message, module, file, line);
} }
/// Python-facing `register_llm(endpoint, path, model_type)`.
///
/// Returns an awaitable that prepares the model at `path` (downloading it
/// from Hugging Face if necessary, loading its ModelDeploymentCard) and
/// then attaches it to `endpoint` so ingress can discover it.
#[pyfunction]
#[pyo3(text_signature = "(endpoint, path, model_type)")]
fn register_llm<'p>(
    py: Python<'p>,
    endpoint: Endpoint,
    path: &str,
    model_type: ModelType,
) -> PyResult<Bound<'p, PyAny>> {
    // Map the Python-exposed enum onto the Rust ModelType.
    let model_type_obj = match model_type {
        ModelType::Chat => llm_rs::model_type::ModelType::Chat,
        ModelType::Completion => llm_rs::model_type::ModelType::Completion,
        ModelType::Backend => llm_rs::model_type::ModelType::Backend,
    };
    // Owned copy so the async block below can be 'static.
    let inner_path = path.to_string();
    pyo3_async_runtimes::tokio::future_into_py(py, async move {
        // Download from HF, load the ModelDeploymentCard
        let mut local_model = llm_rs::LocalModel::prepare(&inner_path, None, None)
            .await
            .map_err(to_pyerr)?;
        // Advertise ourself on etcd so ingress can find us
        local_model
            .attach(&endpoint.inner, model_type_obj)
            .await
            .map_err(to_pyerr)?;
        Ok(())
    })
}
#[pyclass] #[pyclass]
#[derive(Clone)] #[derive(Clone)]
struct EtcdKvCache { struct EtcdKvCache {
...@@ -155,6 +188,15 @@ struct PyLease { ...@@ -155,6 +188,15 @@ struct PyLease {
inner: rs::transports::etcd::Lease, inner: rs::transports::etcd::Lease,
} }
/// Python-exposed mirror of `llm_rs::model_type::ModelType`: what kind of
/// request payload this model expects (see `register_llm`).
#[pyclass(eq, eq_int)]
#[derive(Clone, PartialEq)]
#[repr(i32)]
enum ModelType {
    /// Chat completion requests
    Chat = 1,
    /// Completion requests
    Completion = 2,
    /// Pre-processed (tokenized) requests
    Backend = 3,
}
#[pymethods] #[pymethods]
impl PyLease { impl PyLease {
fn id(&self) -> i64 { fn id(&self) -> i64 {
......
...@@ -26,16 +26,12 @@ impl ModelDeploymentCard {} ...@@ -26,16 +26,12 @@ impl ModelDeploymentCard {}
#[pymethods] #[pymethods]
impl ModelDeploymentCard { impl ModelDeploymentCard {
// Previously called "from_local_path"
#[staticmethod] #[staticmethod]
fn from_local_path( fn load(path: String, model_name: String, py: Python<'_>) -> PyResult<Bound<'_, PyAny>> {
path: String,
model_name: String,
py: Python<'_>,
) -> PyResult<Bound<'_, PyAny>> {
pyo3_async_runtimes::tokio::future_into_py(py, async move { pyo3_async_runtimes::tokio::future_into_py(py, async move {
let card = RsModelDeploymentCard::from_local_path(&path, Some(&model_name)) let mut card = RsModelDeploymentCard::load(&path).await.map_err(to_pyerr)?;
.await card.set_name(&model_name);
.map_err(to_pyerr)?;
Ok(ModelDeploymentCard { inner: card }) Ok(ModelDeploymentCard { inner: card })
}) })
} }
......
...@@ -605,3 +605,12 @@ class HttpAsyncEngine: ...@@ -605,3 +605,12 @@ class HttpAsyncEngine:
""" """
... ...
class ModelType:
    """What type of request this model needs: Chat, Completion or Backend (pre-processed)"""

    ...
async def register_llm(endpoint: Endpoint, path: str, model_type: ModelType) -> None:
    """Attach the model at path to the given endpoint, and advertise it as model_type.

    Downloads the model from Hugging Face if necessary, loads its deployment
    card, and registers it on the network so ingress can discover it.
    """
    ...
...@@ -24,4 +24,6 @@ from dynamo._core import KvMetricsAggregator as KvMetricsAggregator ...@@ -24,4 +24,6 @@ from dynamo._core import KvMetricsAggregator as KvMetricsAggregator
from dynamo._core import KvMetricsPublisher as KvMetricsPublisher from dynamo._core import KvMetricsPublisher as KvMetricsPublisher
from dynamo._core import KvRecorder as KvRecorder from dynamo._core import KvRecorder as KvRecorder
from dynamo._core import KvRouter as KvRouter from dynamo._core import KvRouter as KvRouter
from dynamo._core import ModelType as ModelType
from dynamo._core import OverlapScores as OverlapScores from dynamo._core import OverlapScores as OverlapScores
from dynamo._core import register_llm as register_llm
...@@ -46,6 +46,7 @@ derive_builder = {workspace = true } ...@@ -46,6 +46,7 @@ derive_builder = {workspace = true }
either = { workspace = true } either = { workspace = true }
etcd-client = { workspace = true } etcd-client = { workspace = true }
futures = { workspace = true } futures = { workspace = true }
hf-hub = { workspace = true }
rand = { workspace = true } rand = { workspace = true }
prometheus = { workspace = true } prometheus = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
......
...@@ -20,7 +20,8 @@ const IGNORED: [&str; 3] = [".gitattributes", "LICENSE", "README.md"]; ...@@ -20,7 +20,8 @@ const IGNORED: [&str; 3] = [".gitattributes", "LICENSE", "README.md"];
/// Attempt to download a model from Hugging Face /// Attempt to download a model from Hugging Face
/// Returns the directory it is in /// Returns the directory it is in
pub async fn from_hf(name: &Path) -> anyhow::Result<PathBuf> { pub async fn from_hf(name: impl AsRef<Path>) -> anyhow::Result<PathBuf> {
let name = name.as_ref();
let api = ApiBuilder::new().with_progress(true).build()?; let api = ApiBuilder::new().with_progress(true).build()?;
let model_name = name.display().to_string(); let model_name = name.display().to_string();
......
...@@ -24,8 +24,10 @@ pub mod disagg_router; ...@@ -24,8 +24,10 @@ pub mod disagg_router;
pub mod engines; pub mod engines;
pub mod gguf; pub mod gguf;
pub mod http; pub mod http;
pub mod hub;
pub mod key_value_store; pub mod key_value_store;
pub mod kv_router; pub mod kv_router;
mod local_model;
pub mod model_card; pub mod model_card;
pub mod model_type; pub mod model_type;
pub mod preprocessor; pub mod preprocessor;
...@@ -36,5 +38,7 @@ pub mod tokenizers; ...@@ -36,5 +38,7 @@ pub mod tokenizers;
pub mod tokens; pub mod tokens;
pub mod types; pub mod types;
pub use local_model::LocalModel;
#[cfg(feature = "cuda_kv")] #[cfg(feature = "cuda_kv")]
pub mod kv; pub mod kv;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use anyhow::Context;
use dynamo_runtime::component::Endpoint;
use dynamo_runtime::traits::DistributedRuntimeProvider;
use crate::http::service::discovery::{ModelEntry, ModelNetworkName};
use crate::key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager};
use crate::model_card::{self, ModelDeploymentCard};
use crate::model_type::ModelType;
/// Prefix for Hugging Face model repository
const HF_SCHEME: &str = "hf://";
/// What we call a model if the user didn't provide a name. Usually this means the name
/// is invisible, for example in a text chat.
const DEFAULT_NAME: &str = "dynamo";
/// A model on local disk, ready to serve: its resolved path plus the
/// ModelDeploymentCard describing it.
#[derive(Debug, Clone)]
pub struct LocalModel {
    /// Canonical path to the model folder or GGUF file on disk
    full_path: PathBuf,
    /// Deployment metadata (names, tokenizer, etc.) for this model
    card: ModelDeploymentCard,
}
impl Default for LocalModel {
    /// Placeholder model: empty path and a name-only card using DEFAULT_NAME.
    fn default() -> Self {
        LocalModel {
            full_path: PathBuf::new(),
            card: ModelDeploymentCard::with_name_only(DEFAULT_NAME),
        }
    }
}
impl LocalModel {
    /// The model's deployment card (metadata used for registration and serving)
    pub fn card(&self) -> &ModelDeploymentCard {
        &self.card
    }

    /// Canonical path to the model folder or GGUF file on disk
    pub fn path(&self) -> &Path {
        &self.full_path
    }

    /// Name this model is registered/advertised under
    pub fn service_name(&self) -> &str {
        &self.card.service_name
    }

    /// Make an LLM ready for use:
    /// - Download it from Hugging Face (and NGC in future) if necessary
    /// - Resolve the path
    /// - Load its ModelDeploymentCard card
    /// - Name it correctly
    ///
    /// The model name will depend on what "model_path" is:
    /// - A folder: The last part of the folder name: "/data/llms/Qwen2.5-3B-Instruct" -> "Qwen2.5-3B-Instruct"
    /// - A file: The GGUF filename: "/data/llms/Qwen2.5-3B-Instruct-Q6_K.gguf" -> "Qwen2.5-3B-Instruct-Q6_K.gguf"
    /// - An HF repo: The HF repo name: "Qwen/Qwen2.5-3B-Instruct" stays the same
    pub async fn prepare(
        model_path: &str,
        override_config: Option<&Path>,
        override_name: Option<&str>,
    ) -> anyhow::Result<LocalModel> {
        // Name it
        // Check for hf:// prefix first, in case we really want an HF repo but it conflicts
        // with a relative path.
        // Anything that doesn't exist on local disk is also treated as an HF repo name.
        let is_hf_repo =
            model_path.starts_with(HF_SCHEME) || !fs::exists(model_path).unwrap_or(false);
        let relative_path = model_path.trim_start_matches(HF_SCHEME);

        let full_path = if is_hf_repo {
            // HF download if necessary
            super::hub::from_hf(relative_path).await?
        } else {
            fs::canonicalize(relative_path)?
        };

        let model_name = match override_name.map(|s| s.to_string()) {
            Some(name) => name,
            None => {
                if is_hf_repo {
                    // HF repos use their full name ("org/name") not the folder name
                    relative_path.to_string()
                } else {
                    // Use the last path component: the folder or GGUF file name
                    full_path
                        .iter()
                        .next_back()
                        .map(|n| n.to_string_lossy().into_owned())
                        .with_context(|| {
                            format!("Invalid model path, too short: {}", full_path.display())
                        })?
                }
            }
        };

        // Load the ModelDeploymentCard
        // --model-config takes precedence over --model-path
        let model_config_path = override_config.unwrap_or(&full_path);
        let mut card = ModelDeploymentCard::load(&model_config_path).await?;
        card.set_name(&model_name);

        Ok(LocalModel { full_path, card })
    }

    /// Attach this model to the endpoint. This registers it on the network
    /// allowing ingress to discover it.
    ///
    /// Fails on a static endpoint (no etcd client to register with).
    pub async fn attach(
        &mut self,
        endpoint: &Endpoint,
        model_type: ModelType,
    ) -> anyhow::Result<()> {
        // A static component doesn't have an etcd_client because it doesn't need to register
        let Some(etcd_client) = endpoint.drt().etcd_client() else {
            anyhow::bail!("Cannot attach to static endpoint");
        };

        // Store model config files in NATS object store so ingress running on
        // a different machine can fetch them
        let nats_client = endpoint.drt().nats_client();
        self.card.move_to_nats(nats_client.clone()).await?;

        // Publish the Model Deployment Card to etcd
        let endpoint_id = endpoint.id();
        let kvstore: Box<dyn KeyValueStore> =
            Box::new(EtcdStorage::new(etcd_client.clone(), endpoint_id.clone()));
        let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
        let key = self.card.slug().to_string();
        card_store
            .publish(model_card::BUCKET_NAME, None, &key, &mut self.card)
            .await?;

        // Publish our ModelEntry to etcd. This allows ingress to find the model card.
        // (Why don't we put the model card directly under this key?)
        let network_name = ModelNetworkName::from_local(endpoint, etcd_client.lease_id());
        tracing::debug!("Registering with etcd as {network_name}");
        let model_registration = ModelEntry {
            name: self.service_name().to_string(),
            endpoint: endpoint_id.clone(),
            model_type,
        };
        etcd_client
            .kv_create(
                network_name.to_string(),
                serde_json::to_vec_pretty(&model_registration)?,
                None, // use primary lease
            )
            .await
    }
}
...@@ -23,6 +23,25 @@ use std::path::Path; ...@@ -23,6 +23,25 @@ use std::path::Path;
use crate::model_card::model::{ModelInfoType, PromptFormatterArtifact, TokenizerKind}; use crate::model_card::model::{ModelInfoType, PromptFormatterArtifact, TokenizerKind};
impl ModelDeploymentCard { impl ModelDeploymentCard {
/// Allow user to override the name we register this model under.
/// Corresponds to vllm's `--served-model-name`.
/// Sets both the display name and the service name.
pub fn set_name(&mut self, name: &str) {
    self.display_name = name.to_string();
    self.service_name = name.to_string();
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and token_config.json
/// - a GGUF file
///
/// NOTE(review): any path that is not a directory (including a nonexistent
/// one) falls through to the GGUF loader, whose error will be what the
/// caller sees for a bad path.
pub async fn load(config_path: impl AsRef<Path>) -> anyhow::Result<ModelDeploymentCard> {
    let config_path = config_path.as_ref();
    if config_path.is_dir() {
        Self::from_local_path(config_path).await
    } else {
        Self::from_gguf(config_path).await
    }
}
/// Creates a ModelDeploymentCard from a local directory path. /// Creates a ModelDeploymentCard from a local directory path.
/// ///
/// Currently HuggingFace format is supported and following files are expected: /// Currently HuggingFace format is supported and following files are expected:
...@@ -38,10 +57,7 @@ impl ModelDeploymentCard { ...@@ -38,10 +57,7 @@ impl ModelDeploymentCard {
/// - The path doesn't exist or isn't a directory /// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters /// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid /// - Required model files are missing or invalid
pub async fn from_local_path( async fn from_local_path(local_root_dir: impl AsRef<Path>) -> anyhow::Result<Self> {
local_root_dir: impl AsRef<Path>,
model_name: Option<&str>,
) -> anyhow::Result<Self> {
let local_root_dir = local_root_dir.as_ref(); let local_root_dir = local_root_dir.as_ref();
check_valid_local_repo_path(local_root_dir)?; check_valid_local_repo_path(local_root_dir)?;
let repo_id = local_root_dir let repo_id = local_root_dir
...@@ -49,22 +65,18 @@ impl ModelDeploymentCard { ...@@ -49,22 +65,18 @@ impl ModelDeploymentCard {
.to_str() .to_str()
.ok_or_else(|| anyhow::anyhow!("Path contains invalid Unicode"))? .ok_or_else(|| anyhow::anyhow!("Path contains invalid Unicode"))?
.to_string(); .to_string();
let model_name = model_name.unwrap_or( let model_name = local_root_dir
local_root_dir
.file_name() .file_name()
.and_then(|n| n.to_str()) .and_then(|n| n.to_str())
.ok_or_else(|| anyhow::anyhow!("Invalid model directory name"))?, .ok_or_else(|| anyhow::anyhow!("Invalid model directory name"))?;
);
Self::from_repo(&repo_id, model_name).await Self::from_repo(&repo_id, model_name).await
} }
pub async fn from_gguf(gguf_file: &Path, model_name: Option<&str>) -> anyhow::Result<Self> { async fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let model_name = model_name.map(|s| s.to_string()).or_else(|| { let model_name = gguf_file
gguf_file
.iter() .iter()
.next_back() .next_back()
.map(|n| n.to_string_lossy().to_string()) .map(|n| n.to_string_lossy().to_string());
});
let Some(model_name) = model_name else { let Some(model_name) = model_name else {
// I think this would only happy on an empty path // I think this would only happy on an empty path
anyhow::bail!( anyhow::bail!(
...@@ -81,19 +93,19 @@ impl ModelDeploymentCard { ...@@ -81,19 +93,19 @@ impl ModelDeploymentCard {
prompt_context: None, // TODO - auto-detect prompt context prompt_context: None, // TODO - auto-detect prompt context
revision: 0, revision: 0,
last_published: None, last_published: None,
requires_preprocessing: false,
}) })
} }
/// TODO: This will be implemented after nova-hub is integrated with the model-card /// TODO: This will be implemented after nova-hub is integrated with the model-card
/// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo /// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo
pub async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> { #[allow(dead_code)]
async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> {
Err(anyhow::anyhow!( Err(anyhow::anyhow!(
"ModelDeploymentCard::from_ngc_repo is not implemented" "ModelDeploymentCard::from_ngc_repo is not implemented"
)) ))
} }
pub async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> { async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
Ok(Self { Ok(Self {
display_name: model_name.to_string(), display_name: model_name.to_string(),
service_name: model_name.to_string(), service_name: model_name.to_string(),
...@@ -103,7 +115,6 @@ impl ModelDeploymentCard { ...@@ -103,7 +115,6 @@ impl ModelDeploymentCard {
prompt_context: None, // TODO - auto-detect prompt context prompt_context: None, // TODO - auto-detect prompt context
revision: 0, revision: 0,
last_published: None, last_published: None,
requires_preprocessing: false,
}) })
} }
} }
......
...@@ -125,12 +125,6 @@ pub struct ModelDeploymentCard { ...@@ -125,12 +125,6 @@ pub struct ModelDeploymentCard {
/// Incrementing count of how many times we published this card /// Incrementing count of how many times we published this card
#[serde(default, skip_serializing)] #[serde(default, skip_serializing)]
pub revision: u64, pub revision: u64,
/// Does this model expect preprocessing (tokenization, etc) to be already done?
/// If this is true they get a BackendInput JSON. If this is false they get
/// an NvCreateChatCompletionRequest JSON.
#[serde(default)]
pub requires_preprocessing: bool,
} }
impl ModelDeploymentCard { impl ModelDeploymentCard {
...@@ -171,9 +165,7 @@ impl ModelDeploymentCard { ...@@ -171,9 +165,7 @@ impl ModelDeploymentCard {
/// Load a model deployment card from a JSON file /// Load a model deployment card from a JSON file
pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> { pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
let mut card: ModelDeploymentCard = serde_json::from_str(&std::fs::read_to_string(file)?)?; Ok(serde_json::from_str(&std::fs::read_to_string(file)?)?)
card.requires_preprocessing = false;
Ok(card)
} }
/// Load a model deployment card from a JSON string /// Load a model deployment card from a JSON string
...@@ -218,6 +210,12 @@ impl ModelDeploymentCard { ...@@ -218,6 +210,12 @@ impl ModelDeploymentCard {
} }
} }
/// Is this a full model card with tokenizer?
/// There are cases where we have a placeholder card (see `with_name_only`).
pub fn has_tokenizer(&self) -> bool {
    matches!(self.tokenizer, Some(_))
}
pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> { pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
match &self.tokenizer { match &self.tokenizer {
Some(TokenizerKind::HfTokenizerJson(file)) => { Some(TokenizerKind::HfTokenizerJson(file)) => {
......
...@@ -18,7 +18,7 @@ use dynamo_llm::model_card::model::ModelDeploymentCard; ...@@ -18,7 +18,7 @@ use dynamo_llm::model_card::model::ModelDeploymentCard;
#[tokio::test] #[tokio::test]
async fn test_sequence_factory() { async fn test_sequence_factory() {
let mdc = ModelDeploymentCard::from_local_path("tests/data/sample-models/TinyLlama_v1.1", None) let mdc = ModelDeploymentCard::load("tests/data/sample-models/TinyLlama_v1.1")
.await .await
.unwrap(); .unwrap();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment