Commit ffc6dde1 authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: OpenAI compatible http service (#123)


Signed-off-by: Ryan Olson <ryanolson@users.noreply.github.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent 9d6643b7
......@@ -43,7 +43,7 @@ repos:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*tests/data/replays.*)
# More details about these pre-commit hooks here:
# https://pre-commit.com/hooks.html
- repo: https://github.com/pre-commit/pre-commit-hooks
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[workspace]
members = [
"triton-llm",
]
resolver = "2"
[workspace.package]
# NOTE: keep in sync with the `triton-distributed` dependency version below.
version = "0.1.3"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[workspace.dependencies]
# local or crates.io
triton-distributed = { version = "0.1.3", path = "../../runtime/rust" }
# crates.io
anyhow = { version = "1" }
async-stream = { version = "0.3" }
async-trait = { version = "0.1" }
bytes = "1"
derive_builder = "0.20"
futures = "0.3"
serde = { version = "1", features = ["derive"] }
thiserror = { version = "2.0.11" }
tokio = { version = "1", features = ["full"] }
tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" }
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1", features = ["v4", "serde"] }
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "triton-llm"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
# Inherit repository from the workspace too, matching homepage/license.
repository.workspace = true
[dependencies]
# repo
triton-distributed = { workspace = true }
# workspace
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true }
derive_builder = { workspace = true }
futures = { workspace = true }
serde = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
validator = { workspace = true }
uuid = { workspace = true }
# protocols
chrono = { version = "0.4" }
serde_json = { version = "1" }
regex = "1"
unicode-segmentation = "1.12"
# http-service
axum = "0.8"
prometheus = { version = "0.13" }
[dev-dependencies]
insta = { version = "1.41", features = ["glob", "json", "redactions"] }
proptest = "1.5.0"
reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "rustls-tls"] }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod service;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! HTTP Service for Nova LLM
//!
//! The primary purpose of this crate is to service the nova-llm-protocols via OpenAI compatible HTTP endpoints. This component
//! is meant to be a gateway/ingress into the Nova LLM Distributed Runtime.
//!
//! In order to create a common pattern, the HttpService forwards the incoming OAI Chat Request or OAI Completion Request
//! to a model-specific engine. The engines can be attached and detached dynamically using the [`ModelManager`].
//!
//! Note: All requests, whether the client requests `stream=true` or `stream=false`, are propagated downstream as `stream=true`.
//! This enables us to handle only 1 pattern of request-response in the downstream services. Non-streaming user requests are
//! aggregated by the HttpService and returned as a single response.
//!
//! TODO(): Add support for model-specific metadata and status. Status will allow us to return a 503 when the model is supposed
//! to be ready, but there is a problem with the model.
//!
//! The [`service::HttpService`] can be further extended to host any [`axum::Router`] using the [`service::HttpServiceBuilder`].
mod openai;
pub mod error;
pub mod metrics;
pub mod service_v2;
// #[cfg(feature = "py3")]
// pub mod py3;
pub use async_trait::async_trait;
pub use axum;
pub use error::ServiceHttpError;
pub use metrics::Metrics;
use crate::types::openai::{
chat_completions::OpenAIChatCompletionsStreamingEngine,
completions::OpenAICompletionsStreamingEngine,
};
use std::{
collections::HashMap,
sync::{Arc, Mutex},
};
/// Registry of model engines served over HTTP.
///
/// Cloning is cheap: every clone shares the same [`DeploymentState`]
/// behind an `Arc`, so engines added through one handle are visible to all.
#[derive(Clone)]
pub struct ModelManager {
    // Shared, lock-protected state; see [`DeploymentState`].
    state: Arc<DeploymentState>,
}
impl Default for ModelManager {
fn default() -> Self {
Self::new()
}
}
impl ModelManager {
    /// Create a manager backed by a fresh, empty [`DeploymentState`].
    pub fn new() -> Self {
        Self {
            state: Arc::new(DeploymentState::new()),
        }
    }

    /// Shared handle to the deployment state consumed by the HTTP handlers.
    pub fn state(&self) -> Arc<DeploymentState> {
        Arc::clone(&self.state)
    }

    /// True when `model` is registered for either chat completions or completions.
    pub fn has_model_any(&self, model: &str) -> bool {
        let in_chat = self
            .state
            .chat_completion_engines
            .lock()
            .unwrap()
            .contains(model);
        // `||` short-circuits, so the completions lock is only taken on a miss.
        in_chat
            || self
                .state
                .completion_engines
                .lock()
                .unwrap()
                .contains(model)
    }

    /// Names of all registered chat-completions models.
    pub fn list_chat_completions_models(&self) -> Vec<String> {
        self.state.chat_completion_engines.lock().unwrap().list()
    }

    /// Names of all registered completions models.
    pub fn list_completions_models(&self) -> Vec<String> {
        self.state.completion_engines.lock().unwrap().list()
    }

    /// Register a completions engine under `model`.
    pub fn add_completions_model(
        &self,
        model: &str,
        engine: OpenAICompletionsStreamingEngine,
    ) -> Result<(), ServiceHttpError> {
        self.state
            .completion_engines
            .lock()
            .unwrap()
            .add(model, engine)
    }

    /// Register a chat-completions engine under `model`.
    pub fn add_chat_completions_model(
        &self,
        model: &str,
        engine: OpenAIChatCompletionsStreamingEngine,
    ) -> Result<(), ServiceHttpError> {
        self.state
            .chat_completion_engines
            .lock()
            .unwrap()
            .add(model, engine)
    }

    /// Remove the completions engine registered under `model`.
    pub fn remove_completions_model(&self, model: &str) -> Result<(), ServiceHttpError> {
        self.state
            .completion_engines
            .lock()
            .unwrap()
            .remove(model)
    }

    /// Remove the chat-completions engine registered under `model`.
    pub fn remove_chat_completions_model(&self, model: &str) -> Result<(), ServiceHttpError> {
        self.state
            .chat_completion_engines
            .lock()
            .unwrap()
            .remove(model)
    }

    /// Get the Prometheus [`Metrics`] object which tracks request counts and inflight requests
    pub fn metrics(&self) -> Arc<Metrics> {
        Arc::clone(&self.state.metrics)
    }
}
/// A registry of engines of one kind (completions or chat completions),
/// keyed by model name.
struct ModelEngines<E> {
    /// Optional default model name
    default: Option<String>,
    /// Engine lookup table keyed by model name.
    engines: HashMap<String, E>,
}
impl<E> Default for ModelEngines<E> {
    // Manual impl: #[derive(Default)] would needlessly require `E: Default`.
    fn default() -> Self {
        Self {
            default: None,
            engines: HashMap::new(),
        }
    }
}
impl<E> ModelEngines<E> {
    /// Mark `model` as the default model name.
    #[allow(dead_code)]
    fn set_default(&mut self, model: &str) {
        self.default = Some(model.to_string());
    }

    /// Clear any previously set default model name.
    #[allow(dead_code)]
    fn clear_default(&mut self) {
        self.default = None;
    }

    /// Register `engine` under `model`.
    ///
    /// Returns [`ServiceHttpError::ModelAlreadyExists`] if the name is taken.
    fn add(&mut self, model: &str, engine: E) -> Result<(), ServiceHttpError> {
        // Entry API: one map lookup instead of contains_key + insert.
        match self.engines.entry(model.to_string()) {
            std::collections::hash_map::Entry::Occupied(_) => {
                Err(ServiceHttpError::ModelAlreadyExists(model.to_string()))
            }
            std::collections::hash_map::Entry::Vacant(slot) => {
                slot.insert(engine);
                Ok(())
            }
        }
    }

    /// Deregister `model`; error if it was never registered.
    fn remove(&mut self, model: &str) -> Result<(), ServiceHttpError> {
        if self.engines.remove(model).is_none() {
            return Err(ServiceHttpError::ModelNotFound(model.to_string()));
        }
        Ok(())
    }

    /// Borrow the engine registered under `model`, if any.
    fn get(&self, model: &str) -> Option<&E> {
        self.engines.get(model)
    }

    /// True when `model` has a registered engine.
    fn contains(&self, model: &str) -> bool {
        self.engines.contains_key(model)
    }

    /// All registered model names (unordered).
    fn list(&self) -> Vec<String> {
        self.engines.keys().cloned().collect()
    }
}
/// Global state shared across all the workers; it provides the set of known
/// engine clients plus the shared Prometheus metrics.
pub struct DeploymentState {
    /// Engines serving the OpenAI Completions API, keyed by model name.
    completion_engines: Arc<Mutex<ModelEngines<OpenAICompletionsStreamingEngine>>>,
    /// Engines serving the OpenAI Chat Completions API, keyed by model name.
    chat_completion_engines: Arc<Mutex<ModelEngines<OpenAIChatCompletionsStreamingEngine>>>,
    /// Request/inflight/duration metrics shared with the HTTP handlers.
    metrics: Arc<Metrics>,
}
impl DeploymentState {
    /// Create empty engine registries and default ("nv_llm"-prefixed) metrics.
    fn new() -> Self {
        Self {
            completion_engines: Arc::new(Mutex::new(ModelEngines::default())),
            chat_completion_engines: Arc::new(Mutex::new(ModelEngines::default())),
            metrics: Arc::new(Metrics::default()),
        }
    }

    /// Clone the completions engine registered for `model`, or
    /// [`ServiceHttpError::ModelNotFound`].
    fn get_completions_engine(
        &self,
        model: &str,
    ) -> Result<OpenAICompletionsStreamingEngine, ServiceHttpError> {
        self.completion_engines
            .lock()
            .unwrap()
            .get(model)
            .cloned()
            // ok_or_else: only allocate the error String on the miss path.
            .ok_or_else(|| ServiceHttpError::ModelNotFound(model.to_string()))
    }

    /// Clone the chat-completions engine registered for `model`, or
    /// [`ServiceHttpError::ModelNotFound`].
    fn get_chat_completions_engine(
        &self,
        model: &str,
    ) -> Result<OpenAIChatCompletionsStreamingEngine, ServiceHttpError> {
        self.chat_completion_engines
            .lock()
            .unwrap()
            .get(model)
            .cloned()
            .ok_or_else(|| ServiceHttpError::ModelNotFound(model.to_string()))
    }
}
/// Documentation for a route
#[derive(Debug)]
pub struct RouteDoc {
    /// HTTP verb the route answers (GET, POST, ...).
    method: axum::http::Method,
    /// URL path the route is mounted on, e.g. "/metrics".
    path: String,
}
impl std::fmt::Display for RouteDoc {
    /// Render as `METHOD path`, e.g. `GET /metrics`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{method} {path}", method = self.method, path = self.path)
    }
}
impl RouteDoc {
    /// Record a route's HTTP method and path for documentation listings.
    pub fn new<T: Into<String>>(method: axum::http::Method, path: T) -> Self {
        let path = path.into();
        RouteDoc { method, path }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use thiserror::Error;
/// Errors returned by model-registry operations on the HTTP service.
#[derive(Debug, Error)]
pub enum ServiceHttpError {
    /// No engine is registered under the requested model name.
    #[error("Model not found: {0}")]
    ModelNotFound(String),
    /// An engine is already registered under the requested model name.
    #[error("Model already exists: {0}")]
    ModelAlreadyExists(String),
}
/// Implementations of the Completion Engines served by the HTTP service should
/// map their custom errors to this error type if they wish to return error
/// codes besides 500.
#[derive(Debug, Error)]
#[error("HTTP Error {code}: {message}")]
pub struct HttpError {
    /// HTTP status code to return to the client.
    pub code: u16,
    /// Human-readable message included in the response body.
    pub message: String,
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
use prometheus::{Encoder, HistogramOpts, HistogramVec, IntCounterVec, IntGaugeVec, Opts};
use std::{sync::Arc, time::Instant};
pub use prometheus::Registry;
use super::{DeploymentState, RouteDoc};
// Canonical metric label values — use these instead of inline string literals
// so label values stay consistent across counters and queries.
/// Value for the `status` label in the request counter for successful requests
pub const REQUEST_STATUS_SUCCESS: &str = "success";
/// Value for the `status` label in the request counter if the request failed
pub const REQUEST_STATUS_ERROR: &str = "error";
/// Partial value for the `type` label in the request counter for streaming requests
pub const REQUEST_TYPE_STREAM: &str = "stream";
/// Partial value for the `type` label in the request counter for unary requests
pub const REQUEST_TYPE_UNARY: &str = "unary";
/// Prometheus collectors for the HTTP service.
pub struct Metrics {
    /// Total requests, labeled by model/endpoint/request_type/status.
    request_counter: IntCounterVec,
    /// Currently in-flight requests, labeled by model.
    inflight_gauge: IntGaugeVec,
    /// Request latency histogram (seconds), labeled by model.
    request_duration: HistogramVec,
}
/// RAII object for inflight gauge and request counters
/// If this object is dropped without calling `mark_ok`, then the request will increment
/// the request counter with the `status` label with [`REQUEST_STATUS_ERROR`]; otherwise, it will increment
/// the counter with `status` label [`REQUEST_STATUS_SUCCESS`]
pub struct InflightGuard {
    /// Shared collectors updated on construction and on drop.
    metrics: Arc<Metrics>,
    /// Model name used as the metric label.
    model: String,
    /// Which API endpoint served the request.
    endpoint: Endpoint,
    /// Whether the request was unary or streaming.
    request_type: RequestType,
    /// Final status recorded on drop; starts as Error until `mark_ok`.
    status: Status,
    /// Start time used to observe request duration on drop.
    timer: Instant,
}
/// Requests will be logged by the type of endpoint hit
/// This will include llamastack in the future
pub enum Endpoint {
    /// OAI Completions
    Completions,
    /// OAI Chat Completions
    ChatCompletions,
}
/// Whether the client requested a unary or a streaming response; used as the
/// `request_type` metric label.
pub enum RequestType {
    /// SingleIn / SingleOut
    Unary,
    /// SingleIn / ManyOut
    Stream,
}
/// Final outcome of a request; used as the `status` metric label.
pub enum Status {
    /// Request completed normally (guard was marked ok).
    Success,
    /// Request failed or the guard was dropped without `mark_ok`.
    Error,
}
impl Default for Metrics {
fn default() -> Self {
Self::new("nv_llm")
}
}
impl Metrics {
    /// Create Metrics with the given prefix
    /// The following metrics will be created:
    /// - `{prefix}_http_service_requests_total` - IntCounterVec for the total number of requests processed
    /// - `{prefix}_http_service_inflight_requests` - IntGaugeVec for the number of inflight requests
    /// - `{prefix}_http_service_request_duration_seconds` - HistogramVec for the duration of requests
    pub fn new(prefix: &str) -> Self {
        // Collector construction only fails on malformed names/labels, which
        // are constants here — unwrap states that invariant.
        let request_counter = IntCounterVec::new(
            Opts::new(
                format!("{}_http_service_requests_total", prefix),
                "Total number of LLM requests processed",
            ),
            &["model", "endpoint", "request_type", "status"],
        )
        .unwrap();
        let inflight_gauge = IntGaugeVec::new(
            Opts::new(
                format!("{}_http_service_inflight_requests", prefix),
                "Number of inflight requests",
            ),
            &["model"],
        )
        .unwrap();
        // Doubling latency buckets from sub-second up to ~4.5 minutes.
        let buckets = vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0];
        let request_duration = HistogramVec::new(
            HistogramOpts::new(
                format!("{}_http_service_request_duration_seconds", prefix),
                "Duration of LLM requests",
            )
            .buckets(buckets),
            &["model"],
        )
        .unwrap();
        Metrics {
            request_counter,
            inflight_gauge,
            request_duration,
        }
    }

    /// Label array for the request counter, shared by the getter and the
    /// incrementer so label order can never drift between them.
    fn counter_labels<'a>(
        model: &'a str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) -> [&'a str; 4] {
        [
            model,
            endpoint.as_str(),
            request_type.as_str(),
            status.as_str(),
        ]
    }

    /// Get the number of successful requests for the given dimensions:
    /// - model
    /// - endpoint (completions/chat_completions)
    /// - request type (unary/stream)
    /// - status (success/error)
    pub fn get_request_counter(
        &self,
        model: &str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) -> u64 {
        self.request_counter
            .with_label_values(&Self::counter_labels(model, endpoint, request_type, status))
            .get()
    }

    /// Increment the counter for requests for the given dimensions:
    /// - model
    /// - endpoint (completions/chat_completions)
    /// - request type (unary/stream)
    /// - status (success/error)
    fn inc_request_counter(
        &self,
        model: &str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) {
        self.request_counter
            .with_label_values(&Self::counter_labels(model, endpoint, request_type, status))
            .inc()
    }

    /// Get the number of inflight requests for the given model
    pub fn get_inflight_count(&self, model: &str) -> i64 {
        self.inflight_gauge.with_label_values(&[model]).get()
    }

    /// Increment the inflight gauge for `model` (a request started).
    fn inc_inflight_gauge(&self, model: &str) {
        self.inflight_gauge.with_label_values(&[model]).inc()
    }

    /// Decrement the inflight gauge for `model` (a request finished).
    fn dec_inflight_gauge(&self, model: &str) {
        self.inflight_gauge.with_label_values(&[model]).dec()
    }

    /// Register all collectors with the given Prometheus registry.
    pub fn register(&self, registry: &Registry) -> Result<(), prometheus::Error> {
        registry.register(Box::new(self.request_counter.clone()))?;
        registry.register(Box::new(self.inflight_gauge.clone()))?;
        registry.register(Box::new(self.request_duration.clone()))?;
        Ok(())
    }
}
impl DeploymentState {
/// Create a new [`InflightGuard`] for the given model and annotate if its a streaming request,
/// and the kind of endpoint that was hit
///
/// The [`InflightGuard`] is an RAII object will handle incrementing the inflight gauge and
/// request counters.
pub fn create_inflight_guard(
&self,
model: &str,
endpoint: Endpoint,
streaming: bool,
) -> InflightGuard {
let request_type = if streaming {
RequestType::Stream
} else {
RequestType::Unary
};
InflightGuard::new(
self.metrics.clone(),
model.to_string(),
endpoint,
request_type,
)
}
}
impl InflightGuard {
    /// Start the request timer and bump the inflight gauge; accounting is
    /// finalized by the `Drop` impl.
    fn new(
        metrics: Arc<Metrics>,
        model: String,
        endpoint: Endpoint,
        request_type: RequestType,
    ) -> Self {
        let timer = Instant::now();
        metrics.inc_inflight_gauge(&model);
        Self {
            metrics,
            model,
            endpoint,
            request_type,
            // Pessimistic default: anything short of an explicit mark_ok()
            // is counted as an error when the guard drops.
            status: Status::Error,
            timer,
        }
    }

    /// Flag the request as successful before the guard drops.
    pub(crate) fn mark_ok(&mut self) {
        self.status = Status::Success;
    }
}
impl Drop for InflightGuard {
    /// Finalize request accounting: decrement the inflight gauge, record the
    /// request counter with the final status, and observe the total duration.
    fn drop(&mut self) {
        // Decrement the gauge when the guard is dropped
        self.metrics.dec_inflight_gauge(&self.model);
        // the frequency on incrementing the full request counter is relatively low
        // if we were incrementing the counter on every forward pass, we'd use static CounterVec or
        // discrete counter object without the more costly lookup required for the following calls
        self.metrics.inc_request_counter(
            &self.model,
            &self.endpoint,
            &self.request_type,
            &self.status,
        );
        // Record the duration of the request
        self.metrics
            .request_duration
            .with_label_values(&[&self.model])
            .observe(self.timer.elapsed().as_secs_f64());
    }
}
impl std::fmt::Display for Endpoint {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Endpoint::Completions => write!(f, "completions"),
Endpoint::ChatCompletions => write!(f, "chat_completions"),
}
}
}
impl Endpoint {
pub fn as_str(&self) -> &'static str {
match self {
Endpoint::Completions => "completions",
Endpoint::ChatCompletions => "chat_completions",
}
}
}
impl RequestType {
    /// Metric label value for this request type.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Unary => REQUEST_TYPE_UNARY,
            Self::Stream => REQUEST_TYPE_STREAM,
        }
    }
}
impl Status {
pub fn as_str(&self) -> &'static str {
match self {
Status::Success => REQUEST_STATUS_SUCCESS,
Status::Error => REQUEST_STATUS_ERROR,
}
}
}
/// Build the Prometheus metrics router, served at `path` (default
/// `/metrics`), together with its route documentation.
pub fn router(registry: Registry, path: Option<String>) -> (Vec<RouteDoc>, Router) {
    let metrics_path = path.unwrap_or_else(|| "/metrics".to_string());
    let docs = vec![RouteDoc::new(axum::http::Method::GET, &metrics_path)];
    let router = Router::new()
        .route(&metrics_path, get(handler_metrics))
        .with_state(Arc::new(registry));
    (docs, router)
}
/// Metrics Handler
///
/// Encodes every registered metric family into the Prometheus text format.
/// Returns 200 with the exposition text, or 500 if encoding fails.
async fn handler_metrics(State(registry): State<Arc<Registry>>) -> impl IntoResponse {
    let encoder = prometheus::TextEncoder::new();
    let metric_families = registry.gather();
    let mut buffer = vec![];
    // Collapse both failure modes (encoding and UTF-8 conversion) into a
    // single error path instead of duplicating the 500 response.
    let encoded = encoder
        .encode(&metric_families, &mut buffer)
        .ok()
        .and_then(|_| String::from_utf8(buffer).ok());
    match encoded {
        Some(metrics) => (StatusCode::OK, metrics).into_response(),
        None => (
            StatusCode::INTERNAL_SERVER_ERROR,
            "Failed to encode metrics",
        )
            .into_response(),
    }
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use super::metrics;
use super::ModelManager;
use derive_builder::Builder;
use tokio_util::sync::CancellationToken;
/// OpenAI-compatible HTTP service: a configured axum router plus the model
/// registry it serves. Cloning shares the same underlying model manager.
#[derive(Clone)]
pub struct HttpService {
    /// Registry of model engines exposed by the endpoints.
    models: ModelManager,
    /// Fully assembled axum router (metrics, model list, OAI endpoints).
    router: axum::Router,
    /// TCP port the service binds to (on 0.0.0.0).
    port: u16,
}
/// Configuration for [`HttpService`], constructed through the generated
/// [`HttpServiceConfigBuilder`] (the raw build fn is private; use
/// `HttpServiceConfigBuilder::build`).
#[derive(Clone, Builder)]
#[builder(build_fn(private, name = "build_internal"))]
pub struct HttpServiceConfig {
    /// Port to bind on 0.0.0.0; defaults to 8787.
    #[builder(default = "8787")]
    port: u16,
    // #[builder(default)]
    // custom: Vec<axum::Router>
    /// Serve the OpenAI chat-completions endpoints (default true).
    #[builder(default = "true")]
    enable_chat_endpoints: bool,
    /// Serve the OpenAI completions endpoints (default true).
    #[builder(default = "true")]
    enable_cmpl_endpoints: bool,
}
impl HttpService {
    /// Entry point for configuring a service; finish with `.build()`.
    pub fn builder() -> HttpServiceConfigBuilder {
        HttpServiceConfigBuilder::default()
    }

    /// The model registry backing this service.
    pub fn model_manager(&self) -> &ModelManager {
        &self.models
    }

    /// Serve HTTP until `cancel_token` fires, then shut down gracefully.
    ///
    /// # Errors
    /// Returns an error if the listen address cannot be bound (port in use,
    /// insufficient permissions) or if the server fails while running.
    pub async fn run(&self, cancel_token: CancellationToken) -> anyhow::Result<()> {
        let address = format!("0.0.0.0:{}", self.port);
        tracing::info!(address, "Starting HTTP service on: {address}");
        // Propagate bind failures instead of panicking: the caller declared
        // a Result and a bind error is recoverable (retry, different port).
        let listener = tokio::net::TcpListener::bind(address.as_str())
            .await
            .map_err(|err| anyhow::anyhow!("could not bind to address {address}: {err}"))?;
        let router = self.router.clone();
        let observer = cancel_token.child_token();
        Ok(axum::serve(listener, router)
            .with_graceful_shutdown(observer.cancelled_owned())
            .await
            // If the server dies on its own, cancel the token so siblings
            // observing it also shut down.
            .inspect_err(|_| cancel_token.cancel())?)
    }
}
impl HttpServiceConfigBuilder {
    /// Build the [`HttpService`]: wire up Prometheus metrics, the model
    /// listing route, and whichever OpenAI endpoints are enabled.
    pub fn build(self) -> Result<HttpService, anyhow::Error> {
        let config = self.build_internal()?;

        let model_manager = ModelManager::new();

        // enable prometheus metrics
        let registry = metrics::Registry::new();
        model_manager.metrics().register(&registry)?;

        let mut router = axum::Router::new();
        let mut all_docs = Vec::new();

        let mut routes = vec![
            metrics::router(registry, None),
            super::openai::list_models_router(model_manager.state(), None),
        ];

        // BUG FIX: the flags were swapped — `enable_chat_endpoints` gated the
        // completions router and `enable_cmpl_endpoints` gated the chat
        // router. Each flag now controls its own endpoint family.
        if config.enable_cmpl_endpoints {
            routes.push(super::openai::completions_router(
                model_manager.state(),
                None,
            ));
        }

        if config.enable_chat_endpoints {
            routes.push(super::openai::chat_completions_router(
                model_manager.state(),
                None,
            ));
        }

        for (route_docs, route) in routes.into_iter() {
            router = router.merge(route);
            all_docs.extend(route_docs);
        }
        // NOTE(review): `all_docs` is collected but never exposed; presumably
        // intended for a future docs/route-listing endpoint — confirm.

        Ok(HttpService {
            models: model_manager,
            router,
            port: config.port,
        })
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! # Triton LLM
//!
//! The `triton-llm` crate is a Rust library that provides a set of traits and types for building
//! distributed LLM inference solutions.
pub mod http;
pub mod protocols;
pub mod types;
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
/// Point-in-time engine utilization counters reported per forward pass.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ForwardPassMetrics {
    /// Request slots currently occupied.
    pub request_active_slots: u64,
    /// Total request slots available.
    pub request_total_slots: u64,
    /// KV blocks currently in use (presumably KV-cache blocks — confirm with producer).
    pub kv_active_blocks: u64,
    /// Total KV blocks available.
    pub kv_total_blocks: u64,
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment