Commit 08fcd7e9 authored by Neelay Shah, committed by GitHub
Browse files

refactor: move libs to lib dir


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 0bfd9a76
......@@ -122,7 +122,7 @@ $global:copyright_results = @{
$ignored_files = @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
write-debug "<copyright-check> ignored_files = ['$($ignored_files -join "','")']."
$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'llm/rust/triton-llm/tests/data/sample-models')
$ignored_paths = @('.github', '.mypy_cache', '.pytest_cache', 'lib/llm/tests/data/sample-models')
write-debug "<copyright-check> ignored_paths = ['$($ignored_paths -join "','")']."
$ignored_types = @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
write-debug "<copyright-check> ignored_types = ['$($ignored_types -join "', '")']."
......
......@@ -26,8 +26,8 @@ on:
branches:
- main
paths:
- 'runtime/rust/**'
- 'llm/rust/**'
- 'lib/runtime/**'
- 'lib/llm/**'
- 'applications/llm/tio/**'
- '**.rs'
- 'Cargo.toml'
......@@ -65,26 +65,26 @@ jobs:
- name: Set up Rust Toolchain Components
run: rustup component add rustfmt clippy
- name: Run Cargo Check on runtime
working-directory: runtime/rust
working-directory: lib/runtime
run: cargo check --locked
- name: Run Cargo Check on tio
working-directory: applications/llm/tio
run: cargo check --locked
- name: Verify Code Formatting
working-directory: runtime/rust
working-directory: lib/runtime
run: cargo fmt -- --check
- name: Run Clippy Checks on runtime
working-directory: runtime/rust
working-directory: lib/runtime
run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Run Clippy Checks on tio
working-directory: applications/llm/tio
run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Install and Run cargo-deny
working-directory: runtime/rust
working-directory: lib/runtime
run: |
cargo-deny --version || cargo install cargo-deny
cargo-deny --version || cargo install cargo-deny@0.16.4
cargo-deny check --hide-inclusion-graph licenses
timeout-minutes: 5
- name: Run Unit Tests
working-directory: runtime/rust
working-directory: lib/runtime
run: cargo test --locked --all-targets
......@@ -43,7 +43,8 @@ repos:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*tests/data/*)
exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*lib/llm/tests/data.*)
# More details about these pre-commit hooks here:
# https://pre-commit.com/hooks.html
- repo: https://github.com/pre-commit/pre-commit-hooks
......
......@@ -500,8 +500,8 @@ dependencies = [
"serde_json",
"tokio",
"tracing",
"triton-distributed",
"triton-llm",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
......@@ -3385,85 +3385,85 @@ dependencies = [
]
[[package]]
name = "triton-distributed"
name = "triton-distributed-llm"
version = "0.2.0"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"axum 0.8.1",
"blake3",
"bs62",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"erased-serde",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"minijinja",
"minijinja-contrib",
"prometheus",
"rand",
"regex",
"semver",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"tracing-subscriber",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-llm"
name = "triton-distributed-runtime"
version = "0.2.0"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"axum 0.8.1",
"async_zmq",
"blake3",
"bs62",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"erased-serde",
"etcd-client",
"figment",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"minijinja",
"minijinja-contrib",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"semver",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokenizers",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed",
"unicode-segmentation",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
......
......@@ -20,8 +20,8 @@ edition = "2021"
[dependencies]
# local
triton-distributed = { path = "../../../runtime/rust" }
triton-llm = { path = "../../../llm/rust/triton-llm" }
triton-distributed-runtime = { path = "../../../lib/runtime" }
triton-distributed-llm = { path = "../../../lib/llm" }
# workspace - todo
......
......@@ -25,7 +25,7 @@
use serde::{Deserialize, Serialize};
use triton_distributed::{
use triton_distributed_runtime::{
error, logging,
traits::events::EventPublisher,
utils::{Duration, Instant},
......
......@@ -4744,8 +4744,8 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"triton-distributed",
"triton-llm",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
......@@ -5153,86 +5153,86 @@ dependencies = [
]
[[package]]
name = "triton-distributed"
name = "triton-distributed-llm"
version = "0.2.0"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"axum 0.8.1",
"blake3",
"bs62",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"erased-serde",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix 0.29.0",
"nuid",
"once_cell",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"minijinja",
"minijinja-contrib",
"mistralrs",
"prometheus",
"rand",
"regex",
"semver",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie 0.6.28",
"toktrie_hf_tokenizers 0.6.28",
"tracing",
"tracing-subscriber",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid 1.14.0",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-llm"
name = "triton-distributed-runtime"
version = "0.2.0"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"axum 0.8.1",
"async_zmq",
"blake3",
"bs62",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"erased-serde",
"etcd-client",
"figment",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"minijinja",
"minijinja-contrib",
"mistralrs",
"humantime",
"local-ip-address",
"log",
"nid",
"nix 0.29.0",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"semver",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokenizers",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie 0.6.28",
"toktrie_hf_tokenizers 0.6.28",
"tracing",
"triton-distributed",
"unicode-segmentation",
"tracing-subscriber",
"uuid 1.14.0",
"validator",
"xxhash-rust",
......
......@@ -21,9 +21,9 @@ authors = ["NVIDIA"]
homepage = "https://github.com/triton-inference-server/triton_distributed"
[features]
mistralrs = ["triton-llm/mistralrs"]
cuda = ["triton-llm/cuda"]
metal = ["triton-llm/metal"]
mistralrs = ["triton-distributed-llm/mistralrs"]
cuda = ["triton-distributed-llm/cuda"]
metal = ["triton-distributed-llm/metal"]
[dependencies]
anyhow = "1"
......@@ -42,5 +42,5 @@ tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
triton-distributed = { path = "../../../runtime/rust" }
triton-llm = { path = "../../../llm/rust/triton-llm" }
triton-distributed-runtime = { path = "../../../lib/runtime" }
triton-distributed-llm = { path = "../../../lib/llm" }
......@@ -13,10 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use triton_distributed::{
use triton_distributed_runtime::{
pipeline::network::Ingress, protocols::Endpoint, DistributedRuntime, Runtime,
};
use triton_llm::http::service::discovery::ModelEntry;
use triton_distributed_llm::http::service::discovery::ModelEntry;
use crate::{EngineConfig, ENDPOINT_SCHEME};
......
......@@ -15,8 +15,8 @@
use std::sync::Arc;
use triton_distributed::{DistributedRuntime, Runtime};
use triton_llm::http::service::{discovery, service_v2};
use triton_distributed_runtime::{DistributedRuntime, Runtime};
use triton_distributed_llm::http::service::{discovery, service_v2};
use crate::EngineConfig;
......
......@@ -18,8 +18,8 @@ use std::{
io::{ErrorKind, Read, Write},
sync::Arc,
};
use triton_distributed::{pipeline::Context, runtime::CancellationToken};
use triton_llm::{
use triton_distributed_runtime::{pipeline::Context, runtime::CancellationToken};
use triton_distributed_llm::{
protocols::openai::chat_completions::MessageRole,
types::openai::chat_completions::{
ChatCompletionRequest, OpenAIChatCompletionsStreamingEngine,
......
......@@ -15,8 +15,8 @@
use std::path::PathBuf;
use triton_distributed::{component::Client, DistributedRuntime};
use triton_llm::types::{
use triton_distributed_runtime::{component::Client, DistributedRuntime};
use triton_distributed_llm::types::{
openai::chat_completions::{
ChatCompletionRequest, ChatCompletionResponseDelta, OpenAIChatCompletionsStreamingEngine,
},
......@@ -68,7 +68,7 @@ pub enum EngineConfig {
}
pub async fn run(
runtime: triton_distributed::Runtime,
runtime: triton_distributed_runtime::Runtime,
in_opt: Input,
out_opt: Output,
flags: Flags,
......@@ -138,7 +138,7 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: triton_llm::engines::mistralrs::make_engine(&model_path).await?,
engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path).await?,
}
}
};
......
......@@ -18,7 +18,7 @@ use std::env;
use clap::Parser;
use tio::{Input, Output};
use triton_distributed::logging;
use triton_distributed_runtime::logging;
const HELP: &str = r#"
tio is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use triton-distributed locally.
......@@ -45,15 +45,15 @@ fn main() -> anyhow::Result<()> {
logging::init();
// max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = triton_distributed::RuntimeConfig::from_settings()?;
let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?;
// One per process. Wraps a Runtime with holds two tokio runtimes.
let worker = triton_distributed::Worker::from_config(rt_config)?;
let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
worker.execute(tio_wrapper)
}
async fn tio_wrapper(runtime: triton_distributed::Runtime) -> anyhow::Result<()> {
async fn tio_wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
let mut in_opt = None;
let mut out_opt = None;
let args: Vec<String> = env::args().skip(1).collect();
......
......@@ -18,14 +18,14 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream;
use async_trait::async_trait;
use triton_distributed::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed::protocols::annotated::Annotated;
use triton_llm::protocols::openai::chat_completions::FinishReason;
use triton_llm::protocols::openai::chat_completions::{
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_llm::protocols::openai::chat_completions::FinishReason;
use triton_distributed_llm::protocols::openai::chat_completions::{
ChatCompletionRequest, ChatCompletionResponseDelta, Content,
};
use triton_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
......
......@@ -28,7 +28,6 @@ USER root
RUN apt-get update && \
apt-get install --no-install-recommends --yes gdb protobuf-compiler cmake libssl-dev pkg-config
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
......@@ -131,12 +130,12 @@ RUN apt-get install tmux -y
# Working directory
WORKDIR /workspace
COPY runtime /workspace/runtime
RUN cd runtime/rust && \
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries
COPY llm/rust /workspace/llm/rust
COPY lib/llm /workspace/lib/llm
COPY examples/rust /workspace/examples/rust
RUN cd examples/rust && \
cargo build --release && \
......@@ -144,31 +143,30 @@ RUN cd examples/rust && \
cp target/release/llmctl /usr/local/bin/
# Generate C bindings. Note that this is required for TRTLLM backend re-build
COPY llm/rust /workspace/llm/rust
RUN cd llm/rust/ && \
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c/ && \
cargo build --release --locked && cargo doc --no-deps
# Install uv, create virtualenv for general use, and build triton_distributed_rs wheel
COPY python-wheel /workspace/python-wheel
# Install uv, create virtualenv for general use, and build triton_distributed wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \
cd python-wheel && \
cd lib/bindings/python && \
uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl
uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && \
mkdir /opt/triton/llm_binding/lib && \
cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/. && \
cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/. && \
cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \
cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
# Install triton_distributed_rs wheel globally in container for tests that
# Install triton_distributed_runtime and triton_distributed_llm wheels globally in container for tests that
# currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this.
RUN pip install /opt/triton/llm_binding/wheels/triton_distributed_rs*cp312*.whl
RUN pip install /opt/triton/bindings/wheels/triton_distributed*cp312*.whl
# Copy everything in after install steps to avoid re-running build/install
# commands on unrelated changes in other dirs.
......
......@@ -72,12 +72,12 @@ ENV PATH="/root/.cargo/bin:${PATH}"
# Working directory
WORKDIR /workspace
COPY runtime/rust /workspace/runtime/rust
RUN cd runtime/rust && \
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries
COPY llm/rust /workspace/llm/rust
COPY lib/llm /workspace/lib/llm
COPY examples/rust /workspace/examples/rust
RUN cd examples/rust && \
cargo build --release && \
......@@ -88,23 +88,25 @@ RUN cd examples/rust && \
# COPY applications/...
# Generate C bindings for kv cache routing in vLLM
COPY llm/rust /workspace/llm/rust
RUN cd llm/rust/ && \
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed_rs wheel
COPY python-wheel /workspace/python-wheel
RUN cd python-wheel && \
# Build triton_distributed wheel
RUN source /opt/triton/venv/bin/activate && \
cd lib/bindings/python && \
uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl
uv pip install /workspace/lib/bindings/python/dist/triton_distributed*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && mkdir /opt/triton/llm_binding/lib
RUN cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/.
RUN cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/.
RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \
cp lib/bindings/python/dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
......@@ -130,7 +132,7 @@ RUN apt update -y && \
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
......@@ -149,7 +151,6 @@ COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# if test dependencies start to negatively impact deployment environment/size.
COPY pyproject.toml /workspace/pyproject.toml
COPY container/deps/vllm /workspace/container/deps/vllm
COPY python-wheel/python /workspace/python-wheel/python
# Add library for KV routing
COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
# Copy minimal set of files for deployment/examples
......
......@@ -18,7 +18,8 @@ import argparse
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
from triton_distributed.runtime import DistributedRuntime, triton_worker
from .protocol import Request
......
......@@ -24,7 +24,6 @@ from common.base_engine import BaseVllmEngine
from common.chat_processor import ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
......@@ -32,6 +31,12 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import logger as vllm_logger
from triton_distributed.runtime import (
DistributedRuntime,
triton_endpoint,
triton_worker,
)
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
"""
......
......@@ -21,10 +21,15 @@ import vllm
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest, PrefillResponse
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from triton_distributed.runtime import (
DistributedRuntime,
triton_endpoint,
triton_worker,
)
class VllmPrefillEngine(BaseVllmEngine):
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment