"lib/vscode:/vscode.git/clone" did not exist on "7f136e29c1c697676226ccbefbcbafd7c70dbb58"
Commit 08fcd7e9 authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

refactor: move libs to lib dir


Signed-off-by: default avatarNeelay Shah <neelays@nvidia.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 0bfd9a76
...@@ -122,7 +122,7 @@ $global:copyright_results = @{ ...@@ -122,7 +122,7 @@ $global:copyright_results = @{
$ignored_files = @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml') $ignored_files = @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
write-debug "<copyright-check> ignored_files = ['$($ignored_files -join "','")']." write-debug "<copyright-check> ignored_files = ['$($ignored_files -join "','")']."
$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'llm/rust/triton-llm/tests/data/sample-models') $ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'lib/llm/tests/data/sample-models')
write-debug "<copyright-check> ignored_paths = ['$($ignored_paths -join "','")']." write-debug "<copyright-check> ignored_paths = ['$($ignored_paths -join "','")']."
$ignored_types = @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md') $ignored_types = @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
write-debug "<copyright-check> ignored_types = ['$($ignored_types -join "', '")']." write-debug "<copyright-check> ignored_types = ['$($ignored_types -join "', '")']."
......
...@@ -26,8 +26,8 @@ on: ...@@ -26,8 +26,8 @@ on:
branches: branches:
- main - main
paths: paths:
- 'runtime/rust/**' - 'lib/runtime/**'
- 'llm/rust/**' - 'lib/llm/**'
- 'applications/llm/tio/**' - 'applications/llm/tio/**'
- '**.rs' - '**.rs'
- 'Cargo.toml' - 'Cargo.toml'
...@@ -65,26 +65,26 @@ jobs: ...@@ -65,26 +65,26 @@ jobs:
- name: Set up Rust Toolchain Components - name: Set up Rust Toolchain Components
run: rustup component add rustfmt clippy run: rustup component add rustfmt clippy
- name: Run Cargo Check on runtime - name: Run Cargo Check on runtime
working-directory: runtime/rust working-directory: lib/runtime
run: cargo check --locked run: cargo check --locked
- name: Run Cargo Check on tio - name: Run Cargo Check on tio
working-directory: applications/llm/tio working-directory: applications/llm/tio
run: cargo check --locked run: cargo check --locked
- name: Verify Code Formatting - name: Verify Code Formatting
working-directory: runtime/rust working-directory: lib/runtime
run: cargo fmt -- --check run: cargo fmt -- --check
- name: Run Clippy Checks on runtime - name: Run Clippy Checks on runtime
working-directory: runtime/rust working-directory: lib/runtime
run: cargo clippy --no-deps --all-targets -- -D warnings run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Run Clippy Checks on tio - name: Run Clippy Checks on tio
working-directory: applications/llm/tio working-directory: applications/llm/tio
run: cargo clippy --no-deps --all-targets -- -D warnings run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Install and Run cargo-deny - name: Install and Run cargo-deny
working-directory: runtime/rust working-directory: lib/runtime
run: | run: |
cargo-deny --version || cargo install cargo-deny cargo-deny --version || cargo install cargo-deny@0.16.4
cargo-deny check --hide-inclusion-graph licenses cargo-deny check --hide-inclusion-graph licenses
timeout-minutes: 5 timeout-minutes: 5
- name: Run Unit Tests - name: Run Unit Tests
working-directory: runtime/rust working-directory: lib/runtime
run: cargo test --locked --all-targets run: cargo test --locked --all-targets
...@@ -43,7 +43,8 @@ repos: ...@@ -43,7 +43,8 @@ repos:
- id: codespell - id: codespell
additional_dependencies: [tomli] additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"] args: ["--toml", "pyproject.toml"]
exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*tests/data/*) exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*lib/llm/tests/data.*)
# More details about these pre-commit hooks here: # More details about these pre-commit hooks here:
# https://pre-commit.com/hooks.html # https://pre-commit.com/hooks.html
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
......
...@@ -500,8 +500,8 @@ dependencies = [ ...@@ -500,8 +500,8 @@ dependencies = [
"serde_json", "serde_json",
"tokio", "tokio",
"tracing", "tracing",
"triton-distributed", "triton-distributed-llm",
"triton-llm", "triton-distributed-runtime",
] ]
[[package]] [[package]]
...@@ -3385,85 +3385,85 @@ dependencies = [ ...@@ -3385,85 +3385,85 @@ dependencies = [
] ]
[[package]] [[package]]
name = "triton-distributed" name = "triton-distributed-llm"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-nats",
"async-once-cell",
"async-stream", "async-stream",
"async-trait", "async-trait",
"async_zmq", "axum 0.8.1",
"blake3", "blake3",
"bs62",
"bytes", "bytes",
"chrono", "chrono",
"derive-getters",
"derive_builder", "derive_builder",
"educe",
"either", "either",
"etcd-client", "erased-serde",
"figment",
"futures", "futures",
"humantime", "galil-seiferas",
"local-ip-address", "indexmap 2.7.1",
"log", "itertools 0.14.0",
"nid", "minijinja",
"nix", "minijinja-contrib",
"nuid",
"once_cell",
"prometheus", "prometheus",
"rand",
"regex", "regex",
"semver",
"serde", "serde",
"serde_json", "serde_json",
"socket2", "thiserror 2.0.11",
"thiserror 1.0.69", "tokenizers",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tokio-util", "tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing", "tracing",
"tracing-subscriber", "triton-distributed-runtime",
"unicode-segmentation",
"uuid", "uuid",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
] ]
[[package]] [[package]]
name = "triton-llm" name = "triton-distributed-runtime"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-nats",
"async-once-cell",
"async-stream", "async-stream",
"async-trait", "async-trait",
"axum 0.8.1", "async_zmq",
"blake3", "blake3",
"bs62",
"bytes", "bytes",
"chrono", "chrono",
"derive-getters",
"derive_builder", "derive_builder",
"educe",
"either", "either",
"erased-serde", "etcd-client",
"figment",
"futures", "futures",
"galil-seiferas", "humantime",
"indexmap 2.7.1", "local-ip-address",
"itertools 0.14.0", "log",
"minijinja", "nid",
"minijinja-contrib", "nix",
"nuid",
"once_cell",
"prometheus", "prometheus",
"rand",
"regex", "regex",
"semver",
"serde", "serde",
"serde_json", "serde_json",
"thiserror 2.0.11", "socket2",
"tokenizers", "thiserror 1.0.69",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tokio-util", "tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing", "tracing",
"triton-distributed", "tracing-subscriber",
"unicode-segmentation",
"uuid", "uuid",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
......
...@@ -20,8 +20,8 @@ edition = "2021" ...@@ -20,8 +20,8 @@ edition = "2021"
[dependencies] [dependencies]
# local # local
triton-distributed = { path = "../../../runtime/rust" } triton-distributed-runtime = { path = "../../../lib/runtime" }
triton-llm = { path = "../../../llm/rust/triton-llm" } triton-distributed-llm = { path = "../../../lib/llm" }
# workspace - todo # workspace - todo
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use triton_distributed::{ use triton_distributed_runtime::{
error, logging, error, logging,
traits::events::EventPublisher, traits::events::EventPublisher,
utils::{Duration, Instant}, utils::{Duration, Instant},
......
...@@ -4744,8 +4744,8 @@ dependencies = [ ...@@ -4744,8 +4744,8 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"triton-distributed", "triton-distributed-llm",
"triton-llm", "triton-distributed-runtime",
] ]
[[package]] [[package]]
...@@ -5153,86 +5153,86 @@ dependencies = [ ...@@ -5153,86 +5153,86 @@ dependencies = [
] ]
[[package]] [[package]]
name = "triton-distributed" name = "triton-distributed-llm"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-nats",
"async-once-cell",
"async-stream", "async-stream",
"async-trait", "async-trait",
"async_zmq", "axum 0.8.1",
"blake3", "blake3",
"bs62",
"bytes", "bytes",
"chrono", "chrono",
"derive-getters",
"derive_builder", "derive_builder",
"educe",
"either", "either",
"etcd-client", "erased-serde",
"figment",
"futures", "futures",
"humantime", "galil-seiferas",
"local-ip-address", "indexmap 2.7.1",
"log", "itertools 0.14.0",
"nid", "minijinja",
"nix 0.29.0", "minijinja-contrib",
"nuid", "mistralrs",
"once_cell",
"prometheus", "prometheus",
"rand",
"regex", "regex",
"semver",
"serde", "serde",
"serde_json", "serde_json",
"socket2", "thiserror 2.0.11",
"thiserror 1.0.69", "tokenizers",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tokio-util", "tokio-util",
"toktrie 0.6.28",
"toktrie_hf_tokenizers 0.6.28",
"tracing", "tracing",
"tracing-subscriber", "triton-distributed-runtime",
"unicode-segmentation",
"uuid 1.14.0", "uuid 1.14.0",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
] ]
[[package]] [[package]]
name = "triton-llm" name = "triton-distributed-runtime"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-nats",
"async-once-cell",
"async-stream", "async-stream",
"async-trait", "async-trait",
"axum 0.8.1", "async_zmq",
"blake3", "blake3",
"bs62",
"bytes", "bytes",
"chrono", "chrono",
"derive-getters",
"derive_builder", "derive_builder",
"educe",
"either", "either",
"erased-serde", "etcd-client",
"figment",
"futures", "futures",
"galil-seiferas", "humantime",
"indexmap 2.7.1", "local-ip-address",
"itertools 0.14.0", "log",
"minijinja", "nid",
"minijinja-contrib", "nix 0.29.0",
"mistralrs", "nuid",
"once_cell",
"prometheus", "prometheus",
"rand",
"regex", "regex",
"semver",
"serde", "serde",
"serde_json", "serde_json",
"thiserror 2.0.11", "socket2",
"tokenizers", "thiserror 1.0.69",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tokio-util", "tokio-util",
"toktrie 0.6.28",
"toktrie_hf_tokenizers 0.6.28",
"tracing", "tracing",
"triton-distributed", "tracing-subscriber",
"unicode-segmentation",
"uuid 1.14.0", "uuid 1.14.0",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
......
...@@ -21,9 +21,9 @@ authors = ["NVIDIA"] ...@@ -21,9 +21,9 @@ authors = ["NVIDIA"]
homepage = "https://github.com/triton-inference-server/triton_distributed" homepage = "https://github.com/triton-inference-server/triton_distributed"
[features] [features]
mistralrs = ["triton-llm/mistralrs"] mistralrs = ["triton-distributed-llm/mistralrs"]
cuda = ["triton-llm/cuda"] cuda = ["triton-distributed-llm/cuda"]
metal = ["triton-llm/metal"] metal = ["triton-distributed-llm/metal"]
[dependencies] [dependencies]
anyhow = "1" anyhow = "1"
...@@ -42,5 +42,5 @@ tokio = { version = "1", features = ["full"] } ...@@ -42,5 +42,5 @@ tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.7", features = ["codec", "net"] } tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" } tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] } tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
triton-distributed = { path = "../../../runtime/rust" } triton-distributed-runtime = { path = "../../../lib/runtime" }
triton-llm = { path = "../../../llm/rust/triton-llm" } triton-distributed-llm = { path = "../../../lib/llm" }
...@@ -13,10 +13,10 @@ ...@@ -13,10 +13,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use triton_distributed::{ use triton_distributed_runtime::{
pipeline::network::Ingress, protocols::Endpoint, DistributedRuntime, Runtime, pipeline::network::Ingress, protocols::Endpoint, DistributedRuntime, Runtime,
}; };
use triton_llm::http::service::discovery::ModelEntry; use triton_distributed_llm::http::service::discovery::ModelEntry;
use crate::{EngineConfig, ENDPOINT_SCHEME}; use crate::{EngineConfig, ENDPOINT_SCHEME};
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
use std::sync::Arc; use std::sync::Arc;
use triton_distributed::{DistributedRuntime, Runtime}; use triton_distributed_runtime::{DistributedRuntime, Runtime};
use triton_llm::http::service::{discovery, service_v2}; use triton_distributed_llm::http::service::{discovery, service_v2};
use crate::EngineConfig; use crate::EngineConfig;
......
...@@ -18,8 +18,8 @@ use std::{ ...@@ -18,8 +18,8 @@ use std::{
io::{ErrorKind, Read, Write}, io::{ErrorKind, Read, Write},
sync::Arc, sync::Arc,
}; };
use triton_distributed::{pipeline::Context, runtime::CancellationToken}; use triton_distributed_runtime::{pipeline::Context, runtime::CancellationToken};
use triton_llm::{ use triton_distributed_llm::{
protocols::openai::chat_completions::MessageRole, protocols::openai::chat_completions::MessageRole,
types::openai::chat_completions::{ types::openai::chat_completions::{
ChatCompletionRequest, OpenAIChatCompletionsStreamingEngine, ChatCompletionRequest, OpenAIChatCompletionsStreamingEngine,
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
use std::path::PathBuf; use std::path::PathBuf;
use triton_distributed::{component::Client, DistributedRuntime}; use triton_distributed_runtime::{component::Client, DistributedRuntime};
use triton_llm::types::{ use triton_distributed_llm::types::{
openai::chat_completions::{ openai::chat_completions::{
ChatCompletionRequest, ChatCompletionResponseDelta, OpenAIChatCompletionsStreamingEngine, ChatCompletionRequest, ChatCompletionResponseDelta, OpenAIChatCompletionsStreamingEngine,
}, },
...@@ -68,7 +68,7 @@ pub enum EngineConfig { ...@@ -68,7 +68,7 @@ pub enum EngineConfig {
} }
pub async fn run( pub async fn run(
runtime: triton_distributed::Runtime, runtime: triton_distributed_runtime::Runtime,
in_opt: Input, in_opt: Input,
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
...@@ -138,7 +138,7 @@ pub async fn run( ...@@ -138,7 +138,7 @@ pub async fn run(
}; };
EngineConfig::StaticFull { EngineConfig::StaticFull {
service_name: model_name, service_name: model_name,
engine: triton_llm::engines::mistralrs::make_engine(&model_path).await?, engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path).await?,
} }
} }
}; };
......
...@@ -18,7 +18,7 @@ use std::env; ...@@ -18,7 +18,7 @@ use std::env;
use clap::Parser; use clap::Parser;
use tio::{Input, Output}; use tio::{Input, Output};
use triton_distributed::logging; use triton_distributed_runtime::logging;
const HELP: &str = r#" const HELP: &str = r#"
tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally. tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally.
...@@ -45,15 +45,15 @@ fn main() -> anyhow::Result<()> { ...@@ -45,15 +45,15 @@ fn main() -> anyhow::Result<()> {
logging::init(); logging::init();
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = triton_distributed::RuntimeConfig::from_settings()?; let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?;
// One per process. Wraps a Runtime with holds two tokio runtimes. // One per process. Wraps a Runtime with holds two tokio runtimes.
let worker = triton_distributed::Worker::from_config(rt_config)?; let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
worker.execute(tio_wrapper) worker.execute(tio_wrapper)
} }
async fn tio_wrapper(runtime: triton_distributed::Runtime) -> anyhow::Result<()> { async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
let mut in_opt = None; let mut in_opt = None;
let mut out_opt = None; let mut out_opt = None;
let args: Vec<String> = env::args().skip(1).collect(); let args: Vec<String> = env::args().skip(1).collect();
......
...@@ -18,14 +18,14 @@ use std::{sync::Arc, time::Duration}; ...@@ -18,14 +18,14 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait; use async_trait::async_trait;
use triton_distributed::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed::pipeline::{Error, ManyOut, SingleIn}; use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed::protocols::annotated::Annotated; use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_llm::protocols::openai::chat_completions::FinishReason; use triton_distributed_llm::protocols::openai::chat_completions::FinishReason;
use triton_llm::protocols::openai::chat_completions::{ use triton_distributed_llm::protocols::openai::chat_completions::{
ChatCompletionRequest, ChatCompletionResponseDelta, Content, ChatCompletionRequest, ChatCompletionResponseDelta, Content,
}; };
use triton_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine; use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// How long to sleep between echoed tokens. /// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s. /// 50ms gives us 20 tok/s.
......
...@@ -28,7 +28,6 @@ USER root ...@@ -28,7 +28,6 @@ USER root
RUN apt-get update && \ RUN apt-get update && \
apt-get install --no-install-recommends --yes gdb protobuf-compiler cmake libssl-dev pkg-config apt-get install --no-install-recommends --yes gdb protobuf-compiler cmake libssl-dev pkg-config
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}" ENV PATH="/root/.cargo/bin:${PATH}"
...@@ -131,12 +130,12 @@ RUN apt-get install tmux -y ...@@ -131,12 +130,12 @@ RUN apt-get install tmux -y
# Working directory # Working directory
WORKDIR /workspace WORKDIR /workspace
COPY runtime /workspace/runtime COPY lib/runtime /workspace/lib/runtime
RUN cd runtime/rust && \ RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries # Build OpenAI HTTP Service binaries
COPY llm/rust /workspace/llm/rust COPY lib/llm /workspace/lib/llm
COPY examples/rust /workspace/examples/rust COPY examples/rust /workspace/examples/rust
RUN cd examples/rust && \ RUN cd examples/rust && \
cargo build --release && \ cargo build --release && \
...@@ -144,31 +143,30 @@ RUN cd examples/rust && \ ...@@ -144,31 +143,30 @@ RUN cd examples/rust && \
cp target/release/llmctl /usr/local/bin/ cp target/release/llmctl /usr/local/bin/
# Generate C bindings. Note that this is required for TRTLLM backend re-build # Generate C bindings. Note that this is required for TRTLLM backend re-build
COPY llm/rust /workspace/llm/rust COPY lib/bindings /workspace/lib/bindings
RUN cd llm/rust/ && \ RUN cd lib/bindings/c/ && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Install uv, create virtualenv for general use, and build triton_distributed_rs wheel # Install uv, create virtualenv for general use, and build triton_distributed wheel
COPY python-wheel /workspace/python-wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \ RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12 && \ uv venv /opt/triton/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \ source /opt/triton/venv/bin/activate && \
cd python-wheel && \ cd lib/bindings/python && \
uv build && \ uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
# Package the bindings # Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && \ RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/llm_binding/lib && \ mkdir /opt/triton/bindings/lib && \
cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/. && \ cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/. && \ cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/. cp -r lib/bindings/c/include /opt/triton/bindings/.
# Install triton_distributed_rs wheel globally in container for tests that # Install triton_distributed_runtime and triton_distributed_llm wheels globally in container for tests that
# currently run without virtual environment activated. # currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this. # TODO: In future, we may use a virtualenv for everything and remove this.
RUN pip install /opt/triton/llm_binding/wheels/triton_distributed_rs*cp312*.whl RUN pip install /opt/triton/bindings/wheels/triton_distributed*cp312*.whl
# Copy everything in after install steps to avoid re-running build/install # Copy everything in after install steps to avoid re-running build/install
# commands on unrelated changes in other dirs. # commands on unrelated changes in other dirs.
......
...@@ -63,21 +63,21 @@ RUN apt update -y && \ ...@@ -63,21 +63,21 @@ RUN apt update -y && \
apt install -y \ apt install -y \
build-essential \ build-essential \
protobuf-compiler \ protobuf-compiler \
cmake \ cmake \
libssl-dev \ libssl-dev \
pkg-config && \ pkg-config && \
curl https://sh.rustup.rs -sSf | bash -s -- -y curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}" ENV PATH="/root/.cargo/bin:${PATH}"
# Working directory # Working directory
WORKDIR /workspace WORKDIR /workspace
COPY runtime/rust /workspace/runtime/rust COPY lib/runtime /workspace/lib/runtime
RUN cd runtime/rust && \ RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries # Build OpenAI HTTP Service binaries
COPY llm/rust /workspace/llm/rust COPY lib/llm /workspace/lib/llm
COPY examples/rust /workspace/examples/rust COPY examples/rust /workspace/examples/rust
RUN cd examples/rust && \ RUN cd examples/rust && \
cargo build --release && \ cargo build --release && \
...@@ -88,23 +88,25 @@ RUN cd examples/rust && \ ...@@ -88,23 +88,25 @@ RUN cd examples/rust && \
# COPY applications/... # COPY applications/...
# Generate C bindings for kv cache routing in vLLM # Generate C bindings for kv cache routing in vLLM
COPY llm/rust /workspace/llm/rust COPY lib/bindings /workspace/lib/bindings
RUN cd llm/rust/ && \ RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed_rs wheel # Build triton_distributed wheel
COPY python-wheel /workspace/python-wheel RUN source /opt/triton/venv/bin/activate && \
RUN cd python-wheel && \ cd lib/bindings/python && \
uv build && \ uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
# Package the bindings # Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && mkdir /opt/triton/llm_binding/lib RUN mkdir -p /opt/triton/bindings/wheels && \
RUN cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/. mkdir /opt/triton/bindings/lib && \
RUN cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/. cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/. cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing # Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure # FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace COPY . /workspace
...@@ -130,7 +132,7 @@ RUN apt update -y && \ ...@@ -130,7 +132,7 @@ RUN apt update -y && \
ENV VIRTUAL_ENV=/opt/triton/venv ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# Copy binaries # Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http COPY --from=dev /usr/local/bin/http /usr/local/bin/http
...@@ -149,7 +151,6 @@ COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV} ...@@ -149,7 +151,6 @@ COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# if test dependencies start to negatively impact deployment environment/size. # if test dependencies start to negatively impact deployment environment/size.
COPY pyproject.toml /workspace/pyproject.toml COPY pyproject.toml /workspace/pyproject.toml
COPY container/deps/vllm /workspace/container/deps/vllm COPY container/deps/vllm /workspace/container/deps/vllm
COPY python-wheel/python /workspace/python-wheel/python
# Add library for KV routing # Add library for KV routing
COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH} COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
# Copy minimal set of files for deployment/examples # Copy minimal set of files for deployment/examples
......
...@@ -18,7 +18,8 @@ import argparse ...@@ -18,7 +18,8 @@ import argparse
import asyncio import asyncio
import uvloop import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
from triton_distributed.runtime import DistributedRuntime, triton_worker
from .protocol import Request from .protocol import Request
......
...@@ -24,7 +24,6 @@ from common.base_engine import BaseVllmEngine ...@@ -24,7 +24,6 @@ from common.base_engine import BaseVllmEngine
from common.chat_processor import ProcessMixIn from common.chat_processor import ProcessMixIn
from common.parser import parse_vllm_args from common.parser import parse_vllm_args
from common.protocol import PrefillRequest from common.protocol import PrefillRequest
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
...@@ -32,6 +31,12 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -32,6 +31,12 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from triton_distributed.runtime import (
DistributedRuntime,
triton_endpoint,
triton_worker,
)
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn): class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
""" """
......
...@@ -21,10 +21,15 @@ import vllm ...@@ -21,10 +21,15 @@ import vllm
from common.base_engine import BaseVllmEngine from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args from common.parser import parse_vllm_args
from common.protocol import PrefillRequest, PrefillResponse from common.protocol import PrefillRequest, PrefillResponse
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from triton_distributed.runtime import (
DistributedRuntime,
triton_endpoint,
triton_worker,
)
class VllmPrefillEngine(BaseVllmEngine): class VllmPrefillEngine(BaseVllmEngine):
""" """
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment