Unverified Commit ad8ad66b authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: Shrink the ai-dynamo wheel by 35 MiB (#1918)

Remove http and llmctl binaries. They have been unused for a while.
parent 480b41d1
......@@ -55,8 +55,6 @@ cargo doc --no-deps
# create symlinks for the binaries in the deploy directory
mkdir -p $HOME/dynamo/deploy/sdk/src/dynamo/sdk/cli/bin
ln -sf $HOME/dynamo/.build/target/debug/dynamo-run $HOME/dynamo/deploy/sdk/src/dynamo/sdk/cli/bin/dynamo-run
ln -sf $HOME/dynamo/.build/target/debug/http $HOME/dynamo/deploy/sdk/src/dynamo/sdk/cli/bin/http
ln -sf $HOME/dynamo/.build/target/debug/llmctl $HOME/dynamo/deploy/sdk/src/dynamo/sdk/cli/bin/llmctl
# install the python bindings
cd $HOME/dynamo/lib/bindings/python && retry maturin develop
......
......@@ -714,12 +714,6 @@ version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
[[package]]
name = "bytecount"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
[[package]]
name = "bytemuck"
version = "1.23.1"
......@@ -2973,18 +2967,6 @@ dependencies = [
"itoa",
]
[[package]]
name = "http"
version = "0.3.2"
dependencies = [
"clap 4.5.40",
"dynamo-llm",
"dynamo-runtime",
"serde",
"serde_json",
"tokio",
]
[[package]]
name = "http"
version = "1.3.1"
......@@ -3730,21 +3712,6 @@ dependencies = [
"toktrie 0.7.29",
]
[[package]]
name = "llmctl"
version = "0.3.2"
dependencies = [
"anyhow",
"clap 4.5.40",
"dynamo-llm",
"dynamo-runtime",
"serde",
"serde_json",
"tabled",
"tokio",
"tracing",
]
[[package]]
name = "local-ip-address"
version = "0.6.4"
......@@ -4725,17 +4692,6 @@ dependencies = [
"serde",
]
[[package]]
name = "papergrid"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b915f831b85d984193fdc3d3611505871dc139b2534530fa01c1a6a6707b6723"
dependencies = [
"bytecount",
"fnv",
"unicode-width 0.2.0",
]
[[package]]
name = "parking_lot"
version = "0.12.4"
......@@ -6848,29 +6804,6 @@ dependencies = [
"version-compare",
]
[[package]]
name = "tabled"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121d8171ee5687a4978d1b244f7d99c43e7385a272185a2f1e1fa4dc0979d444"
dependencies = [
"papergrid",
"tabled_derive",
]
[[package]]
name = "tabled_derive"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52d9946811baad81710ec921809e2af67ad77719418673b2a3794932d57b7538"
dependencies = [
"heck 0.5.0",
"proc-macro-error2",
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "target-lexicon"
version = "0.12.16"
......
......@@ -15,7 +15,6 @@
[workspace]
members = [
"components/http",
"components/metrics",
"components/router",
"launch/*",
......
......@@ -120,9 +120,7 @@ dynamo-build:
# Remove existing symlinks
rm -f /workspace/deploy/sdk/src/dynamo/sdk/cli/bin/* && \
# Create new symlinks pointing to the correct location
ln -sf /workspace/target/release/dynamo-run /workspace/deploy/sdk/src/dynamo/sdk/cli/bin/dynamo-run && \
ln -sf /workspace/target/release/http /workspace/deploy/sdk/src/dynamo/sdk/cli/bin/http && \
ln -sf /workspace/target/release/llmctl /workspace/deploy/sdk/src/dynamo/sdk/cli/bin/llmctl
ln -sf /workspace/target/release/dynamo-run /workspace/deploy/sdk/src/dynamo/sdk/cli/bin/dynamo-run
RUN cd /workspace/lib/bindings/python && \
......
......@@ -172,8 +172,6 @@ Otherwise, to develop locally, we recommend working inside of the container
cargo build --release
mkdir -p /workspace/deploy/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/http /workspace/deploy/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/llmctl /workspace/deploy/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/dynamo-run /workspace/deploy/sdk/src/dynamo/sdk/cli/bin
uv pip install -e .
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the `http` crate (the standalone HTTP service binary).
[package]
name = "http"
# Every metadata field below is inherited from the workspace manifest.
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
# These dependencies take their version/features from the workspace manifest.
dynamo-runtime = { workspace = true}
dynamo-llm = { workspace = true}
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
# clap is the only dependency that declares its own version requirement here.
clap = { version = "4.5", features = ["derive"] }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use clap::Parser;
use dynamo_llm::discovery::{ModelWatcher, MODEL_ROOT_PATH};
use dynamo_llm::http::service::service_v2::HttpService;
use dynamo_runtime::{
logging, pipeline::RouterMode, transports::etcd::PrefixWatcher, DistributedRuntime, Result,
Runtime, Worker,
};
// Command-line options of the standalone `http` binary, parsed with clap's derive API.
// The `///` comments on the fields are picked up by clap as each flag's help text, so
// they are part of the program's output and are left exactly as they were.
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Host for the HTTP service
#[arg(long, default_value = "0.0.0.0")]
host: String,
/// Port number for the HTTP service
#[arg(short, long, default_value = "8080")]
port: u16,
// NOTE(review): `namespace` and `component` below are parsed but never read by the
// `app` function visible in this file.
/// Namespace for the distributed component
#[arg(long, default_value = "public")]
namespace: String,
/// Component name for the service
#[arg(long, default_value = "http")]
component: String,
}
/// Entry point of the `http` binary: sets up logging, then hands [`app`] to the
/// worker obtained from `Worker::from_current` and returns whatever it returns.
#[tokio::main]
async fn main() -> Result<()> {
    logging::init();
    Worker::from_current()?.execute_async(app).await
}
/// Builds the HTTP service from the command-line options, starts a background model
/// watcher when an etcd client is available, and then runs the service.
///
/// Arguments are parsed *before* the distributed runtime is created. `Args::parse`
/// terminates the process itself on `--help`, `--version` or invalid arguments, and
/// none of those should first require `DistributedRuntime::from_settings` to succeed.
///
/// # Errors
/// Propagates failures from creating the distributed runtime, building the HTTP
/// service, setting up the etcd prefix watch, and from running the service itself.
async fn app(runtime: Runtime) -> Result<()> {
    let args = Args::parse();
    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;

    let http_service = HttpService::builder()
        .port(args.port)
        .host(args.host)
        .build()?;
    let manager = http_service.state().manager_clone();

    // todo - use the IntoComponent trait to register the component
    // todo - start a service
    // todo - we want the service to create an entry and register component definition
    // todo - the component definition should be the type of component and its config
    // in this example we will have an HttpServiceComponentDefinition object which will be
    // written to etcd
    // the cli when operating on an `http` component will validate the namespace.component is
    // registered with HttpServiceComponentDefinition
    let watch_obj = ModelWatcher::new(distributed.clone(), manager, RouterMode::Random, None);

    // Without an etcd client there is nothing to watch; the service still runs.
    if let Some(etcd_client) = distributed.etcd_client() {
        let models_watcher: PrefixWatcher =
            etcd_client.kv_get_and_watch_prefix(MODEL_ROOT_PATH).await?;
        let (_prefix, _watcher, receiver) = models_watcher.dissolve();
        // The watch loop runs as a detached task; its handle is intentionally not awaited.
        tokio::spawn(async move {
            watch_obj.watch(receiver).await;
        });
    }

    // Run the service
    http_service.run(runtime.child_token()).await
}
......@@ -49,8 +49,6 @@ ENV CARGO_TARGET_DIR=/workspace/target
RUN cargo build --release --locked && \
cargo doc --no-deps && \
cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \
cp target/release/llmctl /usr/local/bin && \
cp target/release/metrics /usr/local/bin && \
cp target/release/mock_worker /usr/local/bin
......
......@@ -367,8 +367,6 @@ RUN mkdir -p /opt/dynamo/bindings/wheels && \
cp target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/. && \
cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \
cp target/release/llmctl /usr/local/bin && \
cp target/release/metrics /usr/local/bin && \
cp target/release/mock_worker /usr/local/bin
......
......@@ -132,8 +132,6 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
RUN cargo build --release
RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin
RUN cp target/release/http deploy/sdk/src/dynamo/sdk/cli/bin
RUN cp target/release/llmctl deploy/sdk/src/dynamo/sdk/cli/bin
RUN cp target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin
RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../..
......
......@@ -317,8 +317,6 @@ RUN mkdir -p /opt/dynamo/bindings/wheels && \
cp target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/. && \
cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \
cp target/release/llmctl /usr/local/bin && \
cp target/release/metrics /usr/local/bin && \
cp target/release/mock_worker /usr/local/bin
......
......@@ -432,8 +432,6 @@ RUN mkdir -p /opt/dynamo/bindings/wheels && \
cp target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/. && \
cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \
cp target/release/llmctl /usr/local/bin && \
cp target/release/metrics /usr/local/bin && \
cp target/release/mock_worker /usr/local/bin
......
......@@ -412,8 +412,6 @@ RUN mkdir -p /opt/dynamo/bindings/wheels && \
cp target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/. && \
cp target/release/dynamo-run /usr/local/bin && \
cp target/release/http /usr/local/bin && \
cp target/release/llmctl /usr/local/bin && \
cp target/release/metrics /usr/local/bin && \
cp target/release/mock_worker /usr/local/bin
......
......@@ -69,28 +69,6 @@ def dynamo_run(args=None):
return result.returncode
def llmctl(args=None):
    """
    Launch the bundled ``llmctl`` executable and return its exit code.

    ``args`` is the argument list handed to the executable; when it is ``None``
    the current process's own arguments (``sys.argv[1:]``) are forwarded instead.
    Output is not captured.
    """
    forwarded = sys.argv[1:] if args is None else args
    completed = run_executable("llmctl", args=forwarded, capture_output=False)
    return completed.returncode
def http(args=None):
    """
    Launch the bundled ``http`` executable and return its exit code.

    ``args`` is the argument list handed to the executable; when it is ``None``
    the current process's own arguments (``sys.argv[1:]``) are forwarded instead.
    Output is not captured.
    """
    forwarded = sys.argv[1:] if args is None else args
    completed = run_executable("http", args=forwarded, capture_output=False)
    return completed.returncode
def metrics(args=None):
"""
Run the metrics executable with the provided arguments.
......
......@@ -206,8 +206,6 @@ Otherwise, to develop locally, we recommend working inside of the container:
cargo build --release
mkdir -p /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/http /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/llmctl /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/dynamo-run /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
uv pip install -e .
......
......@@ -92,10 +92,6 @@ class ResponseType(BaseModel):
# Add other fields as needed
```
For example, if you deploy your worker directly behind an OpenAI HTTP (`http`) service
using `llmctl`, you can define the request and response types to correspond to
Chat Completions objects, such as the ones specified in the OpenAI API, as shown below:
```python
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
......
......@@ -24,8 +24,6 @@ class CustomBuildHook(BuildHookInterface):
bin_path = os.getenv("DYNAMO_BIN_PATH", "target/release")
build_data["force_include"] = {
f"{bin_path}/dynamo-run": "dynamo/sdk/cli/bin/dynamo-run",
f"{bin_path}/llmctl": "dynamo/sdk/cli/bin/llmctl",
f"{bin_path}/http": "dynamo/sdk/cli/bin/http",
f"{bin_path}/metrics": "dynamo/sdk/cli/bin/metrics",
f"{bin_path}/mock_worker": "dynamo/sdk/cli/bin/mock_worker",
f"{bin_path}/libdynamo_llm_capi.so": "dynamo/sdk/cli/bin/libdynamo_llm_capi.so",
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Manifest for the `llmctl` crate (the model-registration command-line binary).
[package]
name = "llmctl"
# Every metadata field below is inherited from the workspace manifest.
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
# These dependencies take their version/features from the workspace manifest.
anyhow = { workspace = true }
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
# clap and tabled declare their own version requirements in this manifest.
clap = { version = "4.5", features = ["derive"] }
tabled = { version = "0.18" }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::sync::Arc;
use clap::{Parser, Subcommand};
use dynamo_llm::discovery::{ModelManager, ModelWatcher};
use dynamo_llm::local_model::{LocalModelBuilder, ModelNetworkName};
use dynamo_llm::model_type::ModelType;
use dynamo_runtime::component::Endpoint;
use dynamo_runtime::pipeline::RouterMode;
use dynamo_runtime::{
distributed::DistributedConfig, logging, DistributedRuntime, Result, Runtime, Worker,
};
// Generates the three per-model-type clap subcommand enums (`AddCommands`,
// `ListCommands`, `RemoveCommands`) and the helpers that turn a parsed variant back
// into a `ModelType`.
//
// Each input tuple has the shape `(Variant, "primary-name", ["alias", ...], "help")`:
// - `Variant` becomes the variant name in all three enums and is also expanded as
//   `ModelType::Variant`, so it has to name an existing `ModelType` variant.
// - the primary name and the aliases are applied to the subcommand in all three enums.
// - the help string is attached (through `#[doc = ...]`) to the `AddCommands` variant
//   only; the list/remove variants build their doc string with `concat!`.
macro_rules! define_type_subcommands {
($(($variant:ident, $primary_name:expr, [$($alias:expr),*], $help:expr)),* $(,)?) => {
// `add <type>`: every variant carries the model name and endpoint name to register.
#[derive(Subcommand)]
enum AddCommands {
$(
#[doc = $help]
#[command(name = $primary_name, aliases = [$($alias),*])]
$variant(AddModelArgs),
)*
}
// `list <type>`: unit variants, they only select which model type to show.
#[derive(Subcommand)]
enum ListCommands {
$(
#[doc = concat!("List ", $primary_name, " models")]
#[command(name = $primary_name, aliases = [$($alias),*])]
$variant,
)*
}
// `remove <type>`: every variant carries the name of the model to remove.
#[derive(Subcommand)]
enum RemoveCommands {
$(
#[doc = concat!("Remove ", $primary_name, " model")]
#[command(name = $primary_name, aliases = [$($alias),*])]
$variant(RemoveModelArgs),
)*
}
impl AddCommands {
// Consumes the parsed subcommand, yielding (model type, model name, endpoint name).
fn into_parts(self) -> (ModelType, String, String) {
match self {
$(Self::$variant(args) => (ModelType::$variant, args.model_name, args.endpoint_name)),*
}
}
}
impl RemoveCommands {
// Consumes the parsed subcommand, yielding (model type, model name).
fn into_parts(self) -> (ModelType, String) {
match self {
$(Self::$variant(args) => (ModelType::$variant, args.model_name)),*
}
}
}
impl ListCommands {
// Maps the selected subcommand to the `ModelType` variant of the same name.
fn model_type(&self) -> ModelType {
match self {
$(Self::$variant => ModelType::$variant),*
}
}
}
}
}
// Instantiates the add/list/remove subcommand enums for the three supported model
// types. Each tuple is (variant, primary subcommand name, aliases, help text); the
// help text is the one shown for the `add` form of the subcommand.
define_type_subcommands!(
(
Chat,
"chat",
["chat-model", "chat-models"],
"Add a chat model"
),
(
Completion,
"completion",
["completions", "completion-model"],
"Add a completion model"
),
// Add new model types here:
(
Embedding,
"embedding",
["embeddings", "embedding-model"],
"Add an embedding model"
)
);
// Top-level command line of `llmctl`: an optional namespace flag plus one subcommand.
//
// `version` carries no literal so clap reports the crate's own package version. The
// previously hard-coded "0.2.1" had drifted from the version the crate inherits from
// the workspace.
//
// The `///` lines on `public_namespace` are clap help text (program output) and are
// kept verbatim; explanatory notes in this block therefore use `//`.
#[derive(Parser)]
#[command(
    author = "NVIDIA",
    version,
    about = "LLMCTL - Deprecated. Do not use.",
    long_about = None,
    disable_help_subcommand = true,
)]
struct Cli {
    /// Public Namespace to operate in
    /// Do not use this. In fact don't use anything about this file.
    #[arg(short = 'n', long)]
    public_namespace: Option<String>,

    // The action to perform; see `Commands`.
    #[command(subcommand)]
    command: Commands,
}
// First-level subcommands of `llmctl`. Only `http` exists; it wraps the actual
// add/list/remove actions defined in `HttpCommands`.
#[derive(Subcommand)]
enum Commands {
/// HTTP service related commands
Http {
#[command(subcommand)]
command: HttpCommands,
},
}
// Actions available under `llmctl http`. Each one takes a further subcommand that
// selects the model type (see the enums generated by `define_type_subcommands!`).
#[derive(Subcommand)]
enum HttpCommands {
/// Add models
Add {
#[command(subcommand)]
model_type: AddCommands,
},
/// List models (all types if no specific type provided)
List {
// Optional: `None` means no model-type subcommand was given.
#[command(subcommand)]
model_type: Option<ListCommands>,
},
/// Remove models
Remove {
#[command(subcommand)]
model_type: RemoveCommands,
},
}
// Arguments carried by every `AddCommands` variant. Neither field declares a
// `short`/`long` flag, so clap treats both as positional arguments, in field order.
#[derive(Parser)]
struct AddModelArgs {
/// Model name (e.g. foo/v1)
#[arg(name = "model-name")]
model_name: String,
/// Endpoint name (format: component.endpoint or namespace.component.endpoint)
#[arg(name = "endpoint-name")]
endpoint_name: String,
}
// Arguments carried by every `RemoveCommands` variant. The single field declares no
// `short`/`long` flag, so clap treats it as a positional argument.
/// Common fields for removing any model type
#[derive(Parser)]
struct RemoveModelArgs {
/// Name of the model to remove
#[arg(name = "model-name")]
model_name: String,
}
fn main() -> Result<()> {
logging::init();
let cli = Cli::parse();
// Default namespace to "dynamo" if not specified
let namespace = cli.public_namespace.unwrap_or_else(|| "dynamo".to_string());
let worker = Worker::from_settings()?;
worker.execute(|runtime| async move { handle_command(runtime, namespace, cli.command).await })
}
/// Creates a distributed runtime with the CLI configuration and dispatches the parsed
/// command to `add_model`, `list_models` or `remove_model`.
///
/// # Errors
/// Returns the error of the runtime construction or of the dispatched operation.
async fn handle_command(runtime: Runtime, namespace: String, command: Commands) -> Result<()> {
    let distributed = DistributedRuntime::new(runtime, DistributedConfig::for_cli()).await?;

    match command {
        Commands::Http {
            command: HttpCommands::Add { model_type },
        } => {
            let (kind, model_name, endpoint_name) = model_type.into_parts();
            add_model(&distributed, namespace, kind, model_name, &endpoint_name).await
        }
        Commands::Http {
            command: HttpCommands::List { model_type },
        } => {
            // `None` (no type subcommand given) lists every model type.
            let wanted = model_type.map(|selected| selected.model_type());
            list_models(&distributed, namespace, wanted).await
        }
        Commands::Http {
            command: HttpCommands::Remove { model_type },
        } => {
            let (kind, model_name) = model_type.into_parts();
            remove_model(&distributed, kind, &model_name).await
        }
    }
}
/// Builds a local model named `model_name` and attaches it, as `model_type`, to the
/// endpoint that `endpoint_name` resolves to.
///
/// # Errors
/// Fails when `model_name` starts with `/`, when the endpoint name cannot be resolved,
/// or when building or attaching the model fails.
async fn add_model(
    distributed: &DistributedRuntime,
    namespace: String,
    model_type: ModelType,
    model_name: String,
    endpoint_name: &str,
) -> Result<()> {
    tracing::debug!("Adding model {model_name} with endpoint {endpoint_name}");

    // Reject the name up front, before any endpoint lookup or model building happens.
    if model_name.starts_with('/') {
        anyhow::bail!("Model name '{model_name}' cannot start with a slash");
    }

    let target = endpoint_from_name(distributed, &namespace, endpoint_name)?;

    let mut local_model = LocalModelBuilder::default()
        .model_name(Some(model_name))
        .build()
        .await?;
    local_model.attach(&target, model_type).await?;

    Ok(())
}
/// One row of the table printed by `list_models`; every field is pre-rendered as a
/// string. The `rename` attributes supply the upper-case column titles.
#[derive(tabled::Tabled)]
struct ModelRow {
#[tabled(rename = "MODEL TYPE")]
model_type: String,
#[tabled(rename = "MODEL NAME")]
name: String,
#[tabled(rename = "NAMESPACE")]
namespace: String,
#[tabled(rename = "COMPONENT")]
component: String,
#[tabled(rename = "ENDPOINT")]
endpoint: String,
}
async fn list_models(
distributed: &DistributedRuntime,
namespace: String,
model_type: Option<ModelType>,
) -> Result<()> {
// We only need a ModelWatcher to call it's all_entries. llmctl is going away so no need to
// refactor for this.
let watcher = ModelWatcher::new(
distributed.clone(),
Arc::new(ModelManager::new()),
RouterMode::Random,
None,
);
let mut models = Vec::new();
for entry in watcher.all_entries().await? {
match (model_type, entry.model_type) {
(None, _) => {
// list all
}
(Some(want), got) if want == got => {
// match
}
_ => {
// no match
continue;
}
}
models.push(ModelRow {
model_type: entry.model_type.as_str().to_string(),
name: entry.name,
namespace: entry.endpoint.namespace,
component: entry.endpoint.component,
endpoint: entry.endpoint.name,
});
}
if models.is_empty() {
match &model_type {
Some(mt) => println!(
"No {} models found in namespace: {}",
mt.as_str(),
namespace
),
None => println!("No models found in namespace: {}", namespace),
}
} else {
let table = tabled::Table::new(models);
match &model_type {
Some(mt) => println!("Listing {} models in namespace: {}", mt.as_str(), namespace),
None => println!("Listing all models in namespace: {}", namespace),
}
println!("{}", table);
}
Ok(())
}
/// Deletes the etcd keys of every entry registered for `model_name` whose type equals
/// `model_type`.
///
/// # Errors
/// Fails when the runtime has no etcd client, when the entries cannot be fetched, or
/// when a key deletion fails (remaining keys are then left untouched).
async fn remove_model(
    distributed: &DistributedRuntime,
    model_type: ModelType,
    model_name: &str,
) -> Result<()> {
    // Removal is done by hand here; normally the etcd lease system takes care of it.
    let watcher = ModelWatcher::new(
        distributed.clone(),
        Arc::new(ModelManager::new()),
        RouterMode::Random,
        None,
    );

    let etcd_client = match distributed.etcd_client() {
        Some(client) => client,
        None => anyhow::bail!("llmctl is only useful with dynamic workers"),
    };

    for entry in watcher.entries_for_model(model_name).await? {
        if entry.model_type != model_type {
            continue;
        }
        let network_name = ModelNetworkName::from_entry(&entry, 0);
        tracing::debug!("deleting key: {network_name}");
        etcd_client
            .kv_delete(network_name.to_string(), None)
            .await?;
    }

    Ok(())
}
fn endpoint_from_name(
distributed: &DistributedRuntime,
namespace: &str,
endpoint_name: &str,
) -> anyhow::Result<Endpoint> {
let parts: Vec<&str> = endpoint_name.split('.').collect();
if parts.len() < 2 {
anyhow::bail!("Endpoint name '{}' is too short. Format should be 'component.endpoint' or 'namespace.component.endpoint'", endpoint_name);
} else if parts.len() > 3 {
anyhow::bail!("Endpoint name '{}' is too long. Format should be 'component.endpoint' or 'namespace.component.endpoint'", endpoint_name);
}
// TODO previous version sometime hardcoded this to "http", so maybe adjust
let component_name = parts[parts.len() - 2].to_string();
let endpoint_name = parts[parts.len() - 1].to_string();
let component = distributed
.namespace(namespace)?
.component(component_name)?;
Ok(component.endpoint(endpoint_name))
}
......@@ -71,8 +71,6 @@ vllm = [
[project.scripts]
dynamo = "dynamo.sdk.cli.cli:cli"
dynamo-run = "dynamo.sdk.cli.run_executable:dynamo_run"
llmctl = "dynamo.sdk.cli.run_executable:llmctl"
http = "dynamo.sdk.cli.run_executable:http"
metrics = "dynamo.sdk.cli.run_executable:metrics"
mock_worker = "dynamo.sdk.cli.run_executable:mock_worker"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment