refactor: move libs to lib dir

Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

refactor: move libs to lib dir

Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
08fcd7e9 · Neelay Shah · GitHub · 0bfd9a76 · 08fcd7e9 · 08fcd7e9
Commit 08fcd7e9, authored Feb 24, 2025 by Neelay Shah; committed by GitHub on Feb 24, 2025.
20 changed files
--- a/.github/workflows/copyright-check.ps1
+++ b/.github/workflows/copyright-check.ps1
@@ -122,7 +122,7 @@ $global:copyright_results = @{
 $ignored_files = @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
 write-debug "<copyright-check> ignored_files = ['$($ignored_files -join "','")']."
-$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'llm/rust/triton-llm/tests/data/sample-models')
+$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'lib/llm/tests/data/sample-models')
 write-debug "<copyright-check> ignored_paths = ['$($ignored_paths -join "','")']."
 $ignored_types = @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
 write-debug "<copyright-check> ignored_types = ['$($ignored_types -join "', '")']."

--- a/.github/workflows/pre-merge-rust.yml
+++ b/.github/workflows/pre-merge-rust.yml
@@ -26,8 +26,8 @@ on:
    branches:
    - main
    paths:
-    - 'runtime/rust/**'
+    - 'lib/runtime/**'
-    - 'llm/rust/**'
+    - 'lib/llm/**'
    - 'applications/llm/tio/**'
    - '**.rs'
    - 'Cargo.toml'
@@ -65,26 +65,26 @@ jobs:
    - name: Set up Rust Toolchain Components
      run: rustup component add rustfmt clippy
    - name: Run Cargo Check on runtime
-      working-directory: runtime/rust
+      working-directory: lib/runtime
      run: cargo check --locked
    - name: Run Cargo Check on tio
      working-directory: applications/llm/tio
      run: cargo check --locked
    - name: Verify Code Formatting
-      working-directory: runtime/rust
+      working-directory: lib/runtime
      run: cargo fmt -- --check
    - name: Run Clippy Checks on runtime
-      working-directory: runtime/rust
+      working-directory: lib/runtime
      run: cargo clippy --no-deps --all-targets -- -D warnings
    - name: Run Clippy Checks on tio
      working-directory: applications/llm/tio
      run: cargo clippy --no-deps --all-targets -- -D warnings
    - name: Install and Run cargo-deny
-      working-directory: runtime/rust
+      working-directory: lib/runtime
      run: |
-        cargo-deny --version || cargo install cargo-deny
+        cargo-deny --version || cargo install cargo-deny@0.16.4
        cargo-deny check --hide-inclusion-graph licenses
      timeout-minutes: 5
    - name: Run Unit Tests
-      working-directory: runtime/rust
+      working-directory: lib/runtime
      run: cargo test --locked --all-targets
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -43,7 +43,8 @@ repos:
  - id: codespell
    additional_dependencies: [tomli]
    args: ["--toml", "pyproject.toml"]
-    exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*tests/data/*)
+    exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*lib/llm/tests/data.*)
 # More details about these pre-commit hooks here:
 # https://pre-commit.com/hooks.html
 - repo: https://github.com/pre-commit/pre-commit-hooks

--- a/applications/llm/count/Cargo.lock
+++ b/applications/llm/count/Cargo.lock
@@ -500,8 +500,8 @@ dependencies = [
 "serde_json",
 "tokio",
 "tracing",
- "triton-distributed",
+ "triton-distributed-llm",
- "triton-llm",
+ "triton-distributed-runtime",
 ]
 [[package]]
@@ -3385,85 +3385,85 @@ dependencies = [
 ]
 [[package]]
-name = "triton-distributed"
+name = "triton-distributed-llm"
 version = "0.2.0"
 dependencies = [
 "anyhow",
- "async-nats",
- "async-once-cell",
 "async-stream",
 "async-trait",
- "async_zmq",
+ "axum 0.8.1",
 "blake3",
+ "bs62",
 "bytes",
 "chrono",
- "derive-getters",
 "derive_builder",
- "educe",
 "either",
- "etcd-client",
+ "erased-serde",
- "figment",
 "futures",
- "humantime",
+ "galil-seiferas",
- "local-ip-address",
+ "indexmap 2.7.1",
- "log",
+ "itertools 0.14.0",
- "nid",
+ "minijinja",
- "nix",
+ "minijinja-contrib",
- "nuid",
- "once_cell",
 "prometheus",
- "rand",
 "regex",
+ "semver",
 "serde",
 "serde_json",
- "socket2",
+ "thiserror 2.0.11",
- "thiserror 1.0.69",
+ "tokenizers",
 "tokio",
 "tokio-stream",
 "tokio-util",
+ "toktrie",
+ "toktrie_hf_tokenizers",
 "tracing",
- "tracing-subscriber",
+ "triton-distributed-runtime",
+ "unicode-segmentation",
 "uuid",
 "validator",
 "xxhash-rust",
 ]
 [[package]]
-name = "triton-llm"
+name = "triton-distributed-runtime"
 version = "0.2.0"
 dependencies = [
 "anyhow",
+ "async-nats",
+ "async-once-cell",
 "async-stream",
 "async-trait",
- "axum 0.8.1",
+ "async_zmq",
 "blake3",
- "bs62",
 "bytes",
 "chrono",
+ "derive-getters",
 "derive_builder",
+ "educe",
 "either",
- "erased-serde",
+ "etcd-client",
+ "figment",
 "futures",
- "galil-seiferas",
+ "humantime",
- "indexmap 2.7.1",
+ "local-ip-address",
- "itertools 0.14.0",
+ "log",
- "minijinja",
+ "nid",
- "minijinja-contrib",
+ "nix",
+ "nuid",
+ "once_cell",
 "prometheus",
+ "rand",
 "regex",
- "semver",
 "serde",
 "serde_json",
- "thiserror 2.0.11",
+ "socket2",
- "tokenizers",
+ "thiserror 1.0.69",
 "tokio",
 "tokio-stream",
 "tokio-util",
- "toktrie",
- "toktrie_hf_tokenizers",
 "tracing",
- "triton-distributed",
+ "tracing-subscriber",
- "unicode-segmentation",
 "uuid",
 "validator",
 "xxhash-rust",

--- a/applications/llm/count/Cargo.toml
+++ b/applications/llm/count/Cargo.toml
@@ -20,8 +20,8 @@ edition = "2021"
 [dependencies]
 # local
-triton-distributed = { path = "../../../runtime/rust" }
+triton-distributed-runtime = { path = "../../../lib/runtime" }
-triton-llm = { path = "../../../llm/rust/triton-llm" }
+triton-distributed-llm = { path = "../../../lib/llm" }
 # workspace - todo

--- a/applications/llm/count/src/main.rs
+++ b/applications/llm/count/src/main.rs
@@ -25,7 +25,7 @@
 use serde::{Deserialize, Serialize};
-use triton_distributed::{
+use triton_distributed_runtime::{
    error, logging,
    traits::events::EventPublisher,
    utils::{Duration, Instant},

--- a/applications/llm/tio/Cargo.lock
+++ b/applications/llm/tio/Cargo.lock
@@ -4744,8 +4744,8 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
- "triton-distributed",
+ "triton-distributed-llm",
- "triton-llm",
+ "triton-distributed-runtime",
 ]
 [[package]]
@@ -5153,86 +5153,86 @@ dependencies = [
 ]
 [[package]]
-name = "triton-distributed"
+name = "triton-distributed-llm"
 version = "0.2.0"
 dependencies = [
 "anyhow",
- "async-nats",
- "async-once-cell",
 "async-stream",
 "async-trait",
- "async_zmq",
+ "axum 0.8.1",
 "blake3",
+ "bs62",
 "bytes",
 "chrono",
- "derive-getters",
 "derive_builder",
- "educe",
 "either",
- "etcd-client",
+ "erased-serde",
- "figment",
 "futures",
- "humantime",
+ "galil-seiferas",
- "local-ip-address",
+ "indexmap 2.7.1",
- "log",
+ "itertools 0.14.0",
- "nid",
+ "minijinja",
- "nix 0.29.0",
+ "minijinja-contrib",
- "nuid",
+ "mistralrs",
- "once_cell",
 "prometheus",
- "rand",
 "regex",
+ "semver",
 "serde",
 "serde_json",
- "socket2",
+ "thiserror 2.0.11",
- "thiserror 1.0.69",
+ "tokenizers",
 "tokio",
 "tokio-stream",
 "tokio-util",
+ "toktrie 0.6.28",
+ "toktrie_hf_tokenizers 0.6.28",
 "tracing",
- "tracing-subscriber",
+ "triton-distributed-runtime",
+ "unicode-segmentation",
 "uuid 1.14.0",
 "validator",
 "xxhash-rust",
 ]
 [[package]]
-name = "triton-llm"
+name = "triton-distributed-runtime"
 version = "0.2.0"
 dependencies = [
 "anyhow",
+ "async-nats",
+ "async-once-cell",
 "async-stream",
 "async-trait",
- "axum 0.8.1",
+ "async_zmq",
 "blake3",
- "bs62",
 "bytes",
 "chrono",
+ "derive-getters",
 "derive_builder",
+ "educe",
 "either",
- "erased-serde",
+ "etcd-client",
+ "figment",
 "futures",
- "galil-seiferas",
+ "humantime",
- "indexmap 2.7.1",
+ "local-ip-address",
- "itertools 0.14.0",
+ "log",
- "minijinja",
+ "nid",
- "minijinja-contrib",
+ "nix 0.29.0",
- "mistralrs",
+ "nuid",
+ "once_cell",
 "prometheus",
+ "rand",
 "regex",
- "semver",
 "serde",
 "serde_json",
- "thiserror 2.0.11",
+ "socket2",
- "tokenizers",
+ "thiserror 1.0.69",
 "tokio",
 "tokio-stream",
 "tokio-util",
- "toktrie 0.6.28",
- "toktrie_hf_tokenizers 0.6.28",
 "tracing",
- "triton-distributed",
+ "tracing-subscriber",
- "unicode-segmentation",
 "uuid 1.14.0",
 "validator",
 "xxhash-rust",

--- a/applications/llm/tio/Cargo.toml
+++ b/applications/llm/tio/Cargo.toml
@@ -21,9 +21,9 @@ authors = ["NVIDIA"]
 homepage = "https://github.com/triton-inference-server/triton_distributed"
 [features]
-mistralrs = ["triton-llm/mistralrs"]
+mistralrs = ["triton-distributed-llm/mistralrs"]
-cuda = ["triton-llm/cuda"]
+cuda = ["triton-distributed-llm/cuda"]
-metal = ["triton-llm/metal"]
+metal = ["triton-distributed-llm/metal"]
 [dependencies]
 anyhow = "1"
@@ -42,5 +42,5 @@ tokio = { version = "1", features = ["full"] }
 tokio-util = { version = "0.7", features = ["codec", "net"] }
 tracing = { version = "0.1" }
 tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
-triton-distributed = { path = "../../../runtime/rust" }
+triton-distributed-runtime = { path = "../../../lib/runtime" }
-triton-llm = { path = "../../../llm/rust/triton-llm" }
+triton-distributed-llm = { path = "../../../lib/llm" }
--- a/applications/llm/tio/src/input/endpoint.rs
+++ b/applications/llm/tio/src/input/endpoint.rs
@@ -13,10 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-use triton_distributed::{
+use triton_distributed_runtime::{
    pipeline::network::Ingress, protocols::Endpoint, DistributedRuntime, Runtime,
 };
-use triton_llm::http::service::discovery::ModelEntry;
+use triton_distributed_llm::http::service::discovery::ModelEntry;
 use crate::{EngineConfig, ENDPOINT_SCHEME};

--- a/applications/llm/tio/src/input/http.rs
+++ b/applications/llm/tio/src/input/http.rs
@@ -15,8 +15,8 @@
 use std::sync::Arc;
-use triton_distributed::{DistributedRuntime, Runtime};
+use triton_distributed_runtime::{DistributedRuntime, Runtime};
-use triton_llm::http::service::{discovery, service_v2};
+use triton_distributed_llm::http::service::{discovery, service_v2};
 use crate::EngineConfig;

--- a/applications/llm/tio/src/input/text.rs
+++ b/applications/llm/tio/src/input/text.rs
@@ -18,8 +18,8 @@ use std::{
    io::{ErrorKind, Read, Write},
    sync::Arc,
 };
-use triton_distributed::{pipeline::Context, runtime::CancellationToken};
+use triton_distributed_runtime::{pipeline::Context, runtime::CancellationToken};
-use triton_llm::{
+use triton_distributed_llm::{
    protocols::openai::chat_completions::MessageRole,
    types::openai::chat_completions::{
        ChatCompletionRequest, OpenAIChatCompletionsStreamingEngine,

--- a/applications/llm/tio/src/lib.rs
+++ b/applications/llm/tio/src/lib.rs
@@ -15,8 +15,8 @@
 use std::path::PathBuf;
-use triton_distributed::{component::Client, DistributedRuntime};
+use triton_distributed_runtime::{component::Client, DistributedRuntime};
-use triton_llm::types::{
+use triton_distributed_llm::types::{
    openai::chat_completions::{
        ChatCompletionRequest, ChatCompletionResponseDelta, OpenAIChatCompletionsStreamingEngine,
    },
@@ -68,7 +68,7 @@ pub enum EngineConfig {
 }
 pub async fn run(
-    runtime: triton_distributed::Runtime,
+    runtime: triton_distributed_runtime::Runtime,
    in_opt: Input,
    out_opt: Output,
    flags: Flags,
@@ -138,7 +138,7 @@ pub async fn run(
            };
            EngineConfig::StaticFull {
                service_name: model_name,
-                engine: triton_llm::engines::mistralrs::make_engine(&model_path).await?,
+                engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path).await?,
            }
        }
    };

--- a/applications/llm/tio/src/main.rs
+++ b/applications/llm/tio/src/main.rs
@@ -18,7 +18,7 @@ use std::env;
 use clap::Parser;
 use tio::{Input, Output};
-use triton_distributed::logging;
+use triton_distributed_runtime::logging;
 const HELP: &str = r#"
 tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally.
@@ -45,15 +45,15 @@ fn main() -> anyhow::Result<()> {
    logging::init();
    // max_worker_threads and max_blocking_threads from env vars or config file.
-    let rt_config = triton_distributed::RuntimeConfig::from_settings()?;
+    let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?;
    // One per process. Wraps a Runtime with holds two tokio runtimes.
-    let worker = triton_distributed::Worker::from_config(rt_config)?;
+    let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
    worker.execute(tio_wrapper)
 }
-async fn tio_wrapper(runtime: triton_distributed::Runtime) -> anyhow::Result<()> {
+async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
    let mut in_opt = None;
    let mut out_opt = None;
    let args: Vec<String> = env::args().skip(1).collect();

--- a/applications/llm/tio/src/output/echo_full.rs
+++ b/applications/llm/tio/src/output/echo_full.rs
@@ -18,14 +18,14 @@ use std::{sync::Arc, time::Duration};
 use async_stream::stream;
 use async_trait::async_trait;
-use triton_distributed::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
+use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
-use triton_distributed::pipeline::{Error, ManyOut, SingleIn};
+use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
-use triton_distributed::protocols::annotated::Annotated;
+use triton_distributed_runtime::protocols::annotated::Annotated;
-use triton_llm::protocols::openai::chat_completions::FinishReason;
+use triton_distributed_llm::protocols::openai::chat_completions::FinishReason;
-use triton_llm::protocols::openai::chat_completions::{
+use triton_distributed_llm::protocols::openai::chat_completions::{
    ChatCompletionRequest, ChatCompletionResponseDelta, Content,
 };
-use triton_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
+use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
 /// How long to sleep between echoed tokens.
 /// 50ms gives us 20 tok/s.

--- a/container/Dockerfile
+++ b/container/Dockerfile
@@ -28,7 +28,6 @@ USER root
 RUN apt-get update && \
    apt-get install --no-install-recommends --yes  gdb protobuf-compiler cmake libssl-dev pkg-config
 RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"
@@ -131,12 +130,12 @@ RUN apt-get install tmux -y
 # Working directory
 WORKDIR /workspace
-COPY runtime /workspace/runtime
+COPY lib/runtime /workspace/lib/runtime
-RUN cd runtime/rust && \
+RUN cd lib/runtime && \
    cargo build --release --locked && cargo doc --no-deps
 # Build OpenAI HTTP Service binaries
-COPY llm/rust /workspace/llm/rust
+COPY lib/llm /workspace/lib/llm
 COPY examples/rust /workspace/examples/rust
 RUN cd examples/rust && \
    cargo build --release && \
@@ -144,31 +143,30 @@ RUN cd examples/rust && \
    cp target/release/llmctl /usr/local/bin/
 # Generate C bindings. Note that this is required for TRTLLM backend re-build
-COPY llm/rust /workspace/llm/rust
+COPY lib/bindings /workspace/lib/bindings
-RUN cd llm/rust/ && \
+RUN cd lib/bindings/c/ && \
    cargo build --release --locked && cargo doc --no-deps
-# Install uv, create virtualenv for general use, and build triton_distributed_rs wheel
+# Install uv, create virtualenv for general use, and build triton_distributed wheel
-COPY python-wheel /workspace/python-wheel
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 RUN mkdir /opt/triton && \
    uv venv /opt/triton/venv --python 3.12 && \
    source /opt/triton/venv/bin/activate && \
-    cd python-wheel && \
+    cd lib/bindings/python && \
    uv build && \
-    uv pip install dist/triton_distributed_rs*cp312*.whl
+    uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
 # Package the bindings
-RUN mkdir -p /opt/triton/llm_binding/wheels && \
+RUN mkdir -p /opt/triton/bindings/wheels && \
-    mkdir /opt/triton/llm_binding/lib && \
+    mkdir /opt/triton/bindings/lib && \
-    cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/. && \
+    cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
-    cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/. && \
+    cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
-    cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
+    cp -r lib/bindings/c/include /opt/triton/bindings/.
-# Install triton_distributed_rs wheel globally in container for tests that
+# Install the triton_distributed wheel globally in container for tests that
 # currently run without virtual environment activated.
 # TODO: In future, we may use a virtualenv for everything and remove this.
-RUN pip install /opt/triton/llm_binding/wheels/triton_distributed_rs*cp312*.whl
+RUN pip install /opt/triton/bindings/wheels/triton_distributed*cp312*.whl
 # Copy everything in after install steps to avoid re-running build/install
 # commands on unrelated changes in other dirs.

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -63,21 +63,21 @@ RUN apt update -y && \
    apt install -y \
    build-essential \
    protobuf-compiler \
-     cmake \
+    cmake \
-     libssl-dev \
+    libssl-dev \
-     pkg-config && \
+    pkg-config && \
    curl https://sh.rustup.rs -sSf | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"
 # Working directory
 WORKDIR /workspace
-COPY runtime/rust /workspace/runtime/rust
+COPY lib/runtime /workspace/lib/runtime
-RUN cd runtime/rust && \
+RUN cd lib/runtime && \
    cargo build --release --locked && cargo doc --no-deps
 # Build OpenAI HTTP Service binaries
-COPY llm/rust /workspace/llm/rust
+COPY lib/llm /workspace/lib/llm
 COPY examples/rust /workspace/examples/rust
 RUN cd examples/rust && \
    cargo build --release && \
@@ -88,23 +88,25 @@ RUN cd examples/rust && \
 # COPY applications/...
 # Generate C bindings for kv cache routing in vLLM
-COPY llm/rust /workspace/llm/rust
+COPY lib/bindings /workspace/lib/bindings
-RUN cd llm/rust/ && \
+RUN cd lib/bindings/c && \
 cargo build --release --locked && cargo doc --no-deps
-# Build triton_distributed_rs wheel
+# Build triton_distributed wheel
-COPY python-wheel /workspace/python-wheel
+RUN source /opt/triton/venv/bin/activate && \
-RUN cd python-wheel && \
+    cd lib/bindings/python && \
    uv build && \
-    uv pip install dist/triton_distributed_rs*cp312*.whl
+    uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
 # Package the bindings
-RUN mkdir -p /opt/triton/llm_binding/wheels && mkdir /opt/triton/llm_binding/lib
+RUN mkdir -p /opt/triton/bindings/wheels && \
-RUN cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/.
+    mkdir /opt/triton/bindings/lib && \
-RUN cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/.
+    cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
-RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
+    cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
+    cp -r lib/bindings/c/include /opt/triton/bindings/.
 # Tell vllm to use the Triton LLM C API for KV Cache Routing
-ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
+ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
 # FIXME: Copy more specific folders in for dev/debug after directory restructure
 COPY . /workspace
@@ -130,7 +132,7 @@ RUN apt update -y && \
 ENV VIRTUAL_ENV=/opt/triton/venv
 ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
 ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
-ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
+ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
 # Copy binaries
 COPY --from=dev /usr/local/bin/http /usr/local/bin/http
@@ -149,7 +151,6 @@ COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
 # if test dependencies start to negatively impact deployment environment/size.
 COPY pyproject.toml /workspace/pyproject.toml
 COPY container/deps/vllm /workspace/container/deps/vllm
-COPY python-wheel/python /workspace/python-wheel/python
 # Add library for KV routing
 COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
 # Copy minimal set of files for deployment/examples

--- a/runtime/rust/docker-compose.yml
+++ b/runtime/rust/docker-compose.yml
--- a/examples/python_rs/llm/vllm/common/client.py
+++ b/examples/python_rs/llm/vllm/common/client.py
@@ -18,7 +18,8 @@ import argparse
 import asyncio
 import uvloop
-from triton_distributed_rs import DistributedRuntime, triton_worker
+from triton_distributed.runtime import DistributedRuntime, triton_worker
 from .protocol import Request

--- a/examples/python_rs/llm/vllm/disaggregated/decode_worker.py
+++ b/examples/python_rs/llm/vllm/disaggregated/decode_worker.py
@@ -24,7 +24,6 @@ from common.base_engine import BaseVllmEngine
 from common.chat_processor import ProcessMixIn
 from common.parser import parse_vllm_args
 from common.protocol import PrefillRequest
-from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
@@ -32,6 +31,12 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.logger import logger as vllm_logger
+from triton_distributed.runtime import (
+    DistributedRuntime,
+    triton_endpoint,
+    triton_worker,
+)
 class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
    """

--- a/examples/python_rs/llm/vllm/disaggregated/prefill_worker.py
+++ b/examples/python_rs/llm/vllm/disaggregated/prefill_worker.py
@@ -21,10 +21,15 @@ import vllm
 from common.base_engine import BaseVllmEngine
 from common.parser import parse_vllm_args
 from common.protocol import PrefillRequest, PrefillResponse
-from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.logger import logger as vllm_logger
+from triton_distributed.runtime import (
+    DistributedRuntime,
+    triton_endpoint,
+    triton_worker,
+)
 class VllmPrefillEngine(BaseVllmEngine):
    """