Commit ffc6dde1 authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: OpenAI compatible http service (#123)


Signed-off-by: Ryan Olson <ryanolson@users.noreply.github.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent 9d6643b7
......@@ -43,7 +43,7 @@ repos:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$|.*tests/data/replays.*)
# More details about these pre-commit hooks here:
# https://pre-commit.com/hooks.html
- repo: https://github.com/pre-commit/pre-commit-hooks
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[workspace]
members = [
"triton-llm",
]
resolver = "2"
[workspace.package]
# NOTE: keep in sync with the `triton-distributed` dependency version below.
version = "0.1.3"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[workspace.dependencies]
# local or crates.io
triton-distributed = { version = "0.1.3", path = "../../runtime/rust" }
# crates.io
anyhow = { version = "1" }
async-stream = { version = "0.3" }
async-trait = { version = "0.1" }
bytes = "1"
derive_builder = "0.20"
futures = "0.3"
serde = { version = "1", features = ["derive"] }
thiserror = { version = "2.0.11" }
tokio = { version = "1", features = ["full"] }
tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" }
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1", features = ["v4", "serde"] }
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "triton-llm"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
# Inherit repository from the workspace too, matching homepage/license.
repository.workspace = true
[dependencies]
# repo
triton-distributed = { workspace = true }
# workspace
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true }
derive_builder = { workspace = true }
futures = { workspace = true }
serde = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
validator = { workspace = true }
uuid = { workspace = true }
# protocols
chrono = { version = "0.4" }
serde_json = { version = "1" }
regex = "1"
unicode-segmentation = "1.12"
# http-service
axum = "0.8"
prometheus = { version = "0.13" }
[dev-dependencies]
insta = { version = "1.41", features = ["glob", "json", "redactions"] }
proptest = "1.5.0"
reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "rustls-tls"] }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod service;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! HTTP Service for Nova LLM
//!
//! The primary purpose of this crate is to service the nova-llm-protocols via OpenAI compatible HTTP endpoints. This component
//! is meant to be a gateway/ingress into the Nova LLM Distributed Runtime.
//!
//! In order to create a common pattern, the HttpService forwards the incoming OAI Chat Request or OAI Completion Request
//! to a model-specific engine. The engines can be attached and detached dynamically using the [`ModelManager`].
//!
//! Note: All requests, whether the client requests `stream=true` or `stream=false`, are propagated downstream as `stream=true`.
//! This enables us to handle only 1 pattern of request-response in the downstream services. Non-streaming user requests are
//! aggregated by the HttpService and returned as a single response.
//!
//! TODO(): Add support for model-specific metadata and status. Status will allow us to return a 503 when the model is supposed
//! to be ready, but there is a problem with the model.
//!
//! The [`service::HttpService`] can be further extended to host any [`axum::Router`] using the [`service::HttpServiceBuilder`].
mod openai;
pub mod error;
pub mod metrics;
pub mod service_v2;
// #[cfg(feature = "py3")]
// pub mod py3;
pub use async_trait::async_trait;
pub use axum;
pub use error::ServiceHttpError;
pub use metrics::Metrics;
use crate::types::openai::{
chat_completions::OpenAIChatCompletionsStreamingEngine,
completions::OpenAICompletionsStreamingEngine,
};
use std::{
collections::HashMap,
sync::{Arc, Mutex},
};
/// Registry of model engines served over HTTP.
///
/// Cloning is cheap: every clone shares the same [`DeploymentState`]
/// behind an `Arc`, so engines added through one handle are visible to all.
#[derive(Clone)]
pub struct ModelManager {
    // Shared, lock-protected state; see [`DeploymentState`].
    state: Arc<DeploymentState>,
}
impl Default for ModelManager {
fn default() -> Self {
Self::new()
}
}
impl ModelManager {
    /// Create a manager backed by a fresh, empty [`DeploymentState`].
    pub fn new() -> Self {
        Self {
            state: Arc::new(DeploymentState::new()),
        }
    }

    /// Shared handle to the deployment state consumed by the HTTP handlers.
    pub fn state(&self) -> Arc<DeploymentState> {
        Arc::clone(&self.state)
    }

    /// True when `model` is registered for either chat completions or completions.
    pub fn has_model_any(&self, model: &str) -> bool {
        let in_chat = self
            .state
            .chat_completion_engines
            .lock()
            .unwrap()
            .contains(model);
        // `||` short-circuits, so the completions lock is only taken on a miss.
        in_chat
            || self
                .state
                .completion_engines
                .lock()
                .unwrap()
                .contains(model)
    }

    /// Names of all registered chat-completions models.
    pub fn list_chat_completions_models(&self) -> Vec<String> {
        self.state.chat_completion_engines.lock().unwrap().list()
    }

    /// Names of all registered completions models.
    pub fn list_completions_models(&self) -> Vec<String> {
        self.state.completion_engines.lock().unwrap().list()
    }

    /// Register a completions engine under `model`.
    pub fn add_completions_model(
        &self,
        model: &str,
        engine: OpenAICompletionsStreamingEngine,
    ) -> Result<(), ServiceHttpError> {
        self.state
            .completion_engines
            .lock()
            .unwrap()
            .add(model, engine)
    }

    /// Register a chat-completions engine under `model`.
    pub fn add_chat_completions_model(
        &self,
        model: &str,
        engine: OpenAIChatCompletionsStreamingEngine,
    ) -> Result<(), ServiceHttpError> {
        self.state
            .chat_completion_engines
            .lock()
            .unwrap()
            .add(model, engine)
    }

    /// Remove the completions engine registered under `model`.
    pub fn remove_completions_model(&self, model: &str) -> Result<(), ServiceHttpError> {
        self.state
            .completion_engines
            .lock()
            .unwrap()
            .remove(model)
    }

    /// Remove the chat-completions engine registered under `model`.
    pub fn remove_chat_completions_model(&self, model: &str) -> Result<(), ServiceHttpError> {
        self.state
            .chat_completion_engines
            .lock()
            .unwrap()
            .remove(model)
    }

    /// Get the Prometheus [`Metrics`] object which tracks request counts and inflight requests
    pub fn metrics(&self) -> Arc<Metrics> {
        Arc::clone(&self.state.metrics)
    }
}
/// A registry of engines of one kind (completions or chat completions),
/// keyed by model name.
struct ModelEngines<E> {
    /// Optional default model name
    default: Option<String>,
    /// Engine lookup table keyed by model name.
    engines: HashMap<String, E>,
}
impl<E> Default for ModelEngines<E> {
    // Manual impl: #[derive(Default)] would needlessly require `E: Default`.
    fn default() -> Self {
        Self {
            default: None,
            engines: HashMap::new(),
        }
    }
}
impl<E> ModelEngines<E> {
    /// Mark `model` as the default model name.
    #[allow(dead_code)]
    fn set_default(&mut self, model: &str) {
        self.default = Some(model.to_string());
    }

    /// Clear any previously set default model name.
    #[allow(dead_code)]
    fn clear_default(&mut self) {
        self.default = None;
    }

    /// Register `engine` under `model`.
    ///
    /// Returns [`ServiceHttpError::ModelAlreadyExists`] if the name is taken.
    fn add(&mut self, model: &str, engine: E) -> Result<(), ServiceHttpError> {
        // Entry API: one map lookup instead of contains_key + insert.
        match self.engines.entry(model.to_string()) {
            std::collections::hash_map::Entry::Occupied(_) => {
                Err(ServiceHttpError::ModelAlreadyExists(model.to_string()))
            }
            std::collections::hash_map::Entry::Vacant(slot) => {
                slot.insert(engine);
                Ok(())
            }
        }
    }

    /// Deregister `model`; error if it was never registered.
    fn remove(&mut self, model: &str) -> Result<(), ServiceHttpError> {
        if self.engines.remove(model).is_none() {
            return Err(ServiceHttpError::ModelNotFound(model.to_string()));
        }
        Ok(())
    }

    /// Borrow the engine registered under `model`, if any.
    fn get(&self, model: &str) -> Option<&E> {
        self.engines.get(model)
    }

    /// True when `model` has a registered engine.
    fn contains(&self, model: &str) -> bool {
        self.engines.contains_key(model)
    }

    /// All registered model names (unordered).
    fn list(&self) -> Vec<String> {
        self.engines.keys().cloned().collect()
    }
}
/// Global state shared across all the workers; it provides the set of known
/// engine clients plus the shared Prometheus metrics.
pub struct DeploymentState {
    /// Engines serving the OpenAI Completions API, keyed by model name.
    completion_engines: Arc<Mutex<ModelEngines<OpenAICompletionsStreamingEngine>>>,
    /// Engines serving the OpenAI Chat Completions API, keyed by model name.
    chat_completion_engines: Arc<Mutex<ModelEngines<OpenAIChatCompletionsStreamingEngine>>>,
    /// Request/inflight/duration metrics shared with the HTTP handlers.
    metrics: Arc<Metrics>,
}
impl DeploymentState {
    /// Create empty engine registries and default ("nv_llm"-prefixed) metrics.
    fn new() -> Self {
        Self {
            completion_engines: Arc::new(Mutex::new(ModelEngines::default())),
            chat_completion_engines: Arc::new(Mutex::new(ModelEngines::default())),
            metrics: Arc::new(Metrics::default()),
        }
    }

    /// Clone the completions engine registered for `model`, or
    /// [`ServiceHttpError::ModelNotFound`].
    fn get_completions_engine(
        &self,
        model: &str,
    ) -> Result<OpenAICompletionsStreamingEngine, ServiceHttpError> {
        self.completion_engines
            .lock()
            .unwrap()
            .get(model)
            .cloned()
            // ok_or_else: only allocate the error String on the miss path.
            .ok_or_else(|| ServiceHttpError::ModelNotFound(model.to_string()))
    }

    /// Clone the chat-completions engine registered for `model`, or
    /// [`ServiceHttpError::ModelNotFound`].
    fn get_chat_completions_engine(
        &self,
        model: &str,
    ) -> Result<OpenAIChatCompletionsStreamingEngine, ServiceHttpError> {
        self.chat_completion_engines
            .lock()
            .unwrap()
            .get(model)
            .cloned()
            .ok_or_else(|| ServiceHttpError::ModelNotFound(model.to_string()))
    }
}
/// Documentation for a route
#[derive(Debug)]
pub struct RouteDoc {
    /// HTTP verb the route answers (GET, POST, ...).
    method: axum::http::Method,
    /// URL path the route is mounted on, e.g. "/metrics".
    path: String,
}
impl std::fmt::Display for RouteDoc {
    /// Render as `METHOD path`, e.g. `GET /metrics`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{method} {path}", method = self.method, path = self.path)
    }
}
impl RouteDoc {
    /// Record a route's HTTP method and path for documentation listings.
    pub fn new<T: Into<String>>(method: axum::http::Method, path: T) -> Self {
        let path = path.into();
        RouteDoc { method, path }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use thiserror::Error;
/// Errors returned by model-registry operations on the HTTP service.
#[derive(Debug, Error)]
pub enum ServiceHttpError {
    /// No engine is registered under the requested model name.
    #[error("Model not found: {0}")]
    ModelNotFound(String),
    /// An engine is already registered under the requested model name.
    #[error("Model already exists: {0}")]
    ModelAlreadyExists(String),
}
/// Implementations of the Completion Engines served by the HTTP service should
/// map their custom errors to this error type if they wish to return error
/// codes besides 500.
#[derive(Debug, Error)]
#[error("HTTP Error {code}: {message}")]
pub struct HttpError {
    /// HTTP status code to return to the client.
    pub code: u16,
    /// Human-readable message included in the response body.
    pub message: String,
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
use prometheus::{Encoder, HistogramOpts, HistogramVec, IntCounterVec, IntGaugeVec, Opts};
use std::{sync::Arc, time::Instant};
pub use prometheus::Registry;
use super::{DeploymentState, RouteDoc};
// Canonical metric label values — use these instead of inline string literals
// so label values stay consistent across counters and queries.
/// Value for the `status` label in the request counter for successful requests
pub const REQUEST_STATUS_SUCCESS: &str = "success";
/// Value for the `status` label in the request counter if the request failed
pub const REQUEST_STATUS_ERROR: &str = "error";
/// Partial value for the `type` label in the request counter for streaming requests
pub const REQUEST_TYPE_STREAM: &str = "stream";
/// Partial value for the `type` label in the request counter for unary requests
pub const REQUEST_TYPE_UNARY: &str = "unary";
/// Prometheus collectors for the HTTP service.
pub struct Metrics {
    /// Total requests, labeled by model/endpoint/request_type/status.
    request_counter: IntCounterVec,
    /// Currently in-flight requests, labeled by model.
    inflight_gauge: IntGaugeVec,
    /// Request latency histogram (seconds), labeled by model.
    request_duration: HistogramVec,
}
/// RAII object for inflight gauge and request counters
/// If this object is dropped without calling `mark_ok`, then the request will increment
/// the request counter with the `status` label with [`REQUEST_STATUS_ERROR`]; otherwise, it will increment
/// the counter with `status` label [`REQUEST_STATUS_SUCCESS`]
pub struct InflightGuard {
    /// Shared collectors updated on construction and on drop.
    metrics: Arc<Metrics>,
    /// Model name used as the metric label.
    model: String,
    /// Which API endpoint served the request.
    endpoint: Endpoint,
    /// Whether the request was unary or streaming.
    request_type: RequestType,
    /// Final status recorded on drop; starts as Error until `mark_ok`.
    status: Status,
    /// Start time used to observe request duration on drop.
    timer: Instant,
}
/// Requests will be logged by the type of endpoint hit
/// This will include llamastack in the future
pub enum Endpoint {
    /// OAI Completions
    Completions,
    /// OAI Chat Completions
    ChatCompletions,
}
/// Whether the client requested a unary or a streaming response; used as the
/// `request_type` metric label.
pub enum RequestType {
    /// SingleIn / SingleOut
    Unary,
    /// SingleIn / ManyOut
    Stream,
}
/// Final outcome of a request; used as the `status` metric label.
pub enum Status {
    /// Request completed normally (guard was marked ok).
    Success,
    /// Request failed or the guard was dropped without `mark_ok`.
    Error,
}
impl Default for Metrics {
fn default() -> Self {
Self::new("nv_llm")
}
}
impl Metrics {
    /// Create Metrics with the given prefix
    /// The following metrics will be created:
    /// - `{prefix}_http_service_requests_total` - IntCounterVec for the total number of requests processed
    /// - `{prefix}_http_service_inflight_requests` - IntGaugeVec for the number of inflight requests
    /// - `{prefix}_http_service_request_duration_seconds` - HistogramVec for the duration of requests
    pub fn new(prefix: &str) -> Self {
        // Collector construction only fails on malformed names/labels, which
        // are constants here — unwrap states that invariant.
        let request_counter = IntCounterVec::new(
            Opts::new(
                format!("{}_http_service_requests_total", prefix),
                "Total number of LLM requests processed",
            ),
            &["model", "endpoint", "request_type", "status"],
        )
        .unwrap();
        let inflight_gauge = IntGaugeVec::new(
            Opts::new(
                format!("{}_http_service_inflight_requests", prefix),
                "Number of inflight requests",
            ),
            &["model"],
        )
        .unwrap();
        // Doubling latency buckets from sub-second up to ~4.5 minutes.
        let buckets = vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0];
        let request_duration = HistogramVec::new(
            HistogramOpts::new(
                format!("{}_http_service_request_duration_seconds", prefix),
                "Duration of LLM requests",
            )
            .buckets(buckets),
            &["model"],
        )
        .unwrap();
        Metrics {
            request_counter,
            inflight_gauge,
            request_duration,
        }
    }

    /// Label array for the request counter, shared by the getter and the
    /// incrementer so label order can never drift between them.
    fn counter_labels<'a>(
        model: &'a str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) -> [&'a str; 4] {
        [
            model,
            endpoint.as_str(),
            request_type.as_str(),
            status.as_str(),
        ]
    }

    /// Get the number of successful requests for the given dimensions:
    /// - model
    /// - endpoint (completions/chat_completions)
    /// - request type (unary/stream)
    /// - status (success/error)
    pub fn get_request_counter(
        &self,
        model: &str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) -> u64 {
        self.request_counter
            .with_label_values(&Self::counter_labels(model, endpoint, request_type, status))
            .get()
    }

    /// Increment the counter for requests for the given dimensions:
    /// - model
    /// - endpoint (completions/chat_completions)
    /// - request type (unary/stream)
    /// - status (success/error)
    fn inc_request_counter(
        &self,
        model: &str,
        endpoint: &Endpoint,
        request_type: &RequestType,
        status: &Status,
    ) {
        self.request_counter
            .with_label_values(&Self::counter_labels(model, endpoint, request_type, status))
            .inc()
    }

    /// Get the number of inflight requests for the given model
    pub fn get_inflight_count(&self, model: &str) -> i64 {
        self.inflight_gauge.with_label_values(&[model]).get()
    }

    /// Increment the inflight gauge for `model` (a request started).
    fn inc_inflight_gauge(&self, model: &str) {
        self.inflight_gauge.with_label_values(&[model]).inc()
    }

    /// Decrement the inflight gauge for `model` (a request finished).
    fn dec_inflight_gauge(&self, model: &str) {
        self.inflight_gauge.with_label_values(&[model]).dec()
    }

    /// Register all collectors with the given Prometheus registry.
    pub fn register(&self, registry: &Registry) -> Result<(), prometheus::Error> {
        registry.register(Box::new(self.request_counter.clone()))?;
        registry.register(Box::new(self.inflight_gauge.clone()))?;
        registry.register(Box::new(self.request_duration.clone()))?;
        Ok(())
    }
}
impl DeploymentState {
/// Create a new [`InflightGuard`] for the given model and annotate if its a streaming request,
/// and the kind of endpoint that was hit
///
/// The [`InflightGuard`] is an RAII object will handle incrementing the inflight gauge and
/// request counters.
pub fn create_inflight_guard(
&self,
model: &str,
endpoint: Endpoint,
streaming: bool,
) -> InflightGuard {
let request_type = if streaming {
RequestType::Stream
} else {
RequestType::Unary
};
InflightGuard::new(
self.metrics.clone(),
model.to_string(),
endpoint,
request_type,
)
}
}
impl InflightGuard {
    /// Start the request timer and bump the inflight gauge; accounting is
    /// finalized by the `Drop` impl.
    fn new(
        metrics: Arc<Metrics>,
        model: String,
        endpoint: Endpoint,
        request_type: RequestType,
    ) -> Self {
        let timer = Instant::now();
        metrics.inc_inflight_gauge(&model);
        Self {
            metrics,
            model,
            endpoint,
            request_type,
            // Pessimistic default: anything short of an explicit mark_ok()
            // is counted as an error when the guard drops.
            status: Status::Error,
            timer,
        }
    }

    /// Flag the request as successful before the guard drops.
    pub(crate) fn mark_ok(&mut self) {
        self.status = Status::Success;
    }
}
impl Drop for InflightGuard {
    /// Finalize request accounting: decrement the inflight gauge, record the
    /// request counter with the final status, and observe the total duration.
    fn drop(&mut self) {
        // Decrement the gauge when the guard is dropped
        self.metrics.dec_inflight_gauge(&self.model);
        // the frequency on incrementing the full request counter is relatively low
        // if we were incrementing the counter on every forward pass, we'd use static CounterVec or
        // discrete counter object without the more costly lookup required for the following calls
        self.metrics.inc_request_counter(
            &self.model,
            &self.endpoint,
            &self.request_type,
            &self.status,
        );
        // Record the duration of the request
        self.metrics
            .request_duration
            .with_label_values(&[&self.model])
            .observe(self.timer.elapsed().as_secs_f64());
    }
}
impl std::fmt::Display for Endpoint {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Endpoint::Completions => write!(f, "completions"),
Endpoint::ChatCompletions => write!(f, "chat_completions"),
}
}
}
impl Endpoint {
pub fn as_str(&self) -> &'static str {
match self {
Endpoint::Completions => "completions",
Endpoint::ChatCompletions => "chat_completions",
}
}
}
impl RequestType {
    /// Metric label value for this request type.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Unary => REQUEST_TYPE_UNARY,
            Self::Stream => REQUEST_TYPE_STREAM,
        }
    }
}
impl Status {
pub fn as_str(&self) -> &'static str {
match self {
Status::Success => REQUEST_STATUS_SUCCESS,
Status::Error => REQUEST_STATUS_ERROR,
}
}
}
/// Build the Prometheus metrics router, served at `path` (default
/// `/metrics`), together with its route documentation.
pub fn router(registry: Registry, path: Option<String>) -> (Vec<RouteDoc>, Router) {
    let metrics_path = path.unwrap_or_else(|| "/metrics".to_string());
    let docs = vec![RouteDoc::new(axum::http::Method::GET, &metrics_path)];
    let router = Router::new()
        .route(&metrics_path, get(handler_metrics))
        .with_state(Arc::new(registry));
    (docs, router)
}
/// Metrics Handler
///
/// Encodes every registered metric family into the Prometheus text format.
/// Returns 200 with the exposition text, or 500 if encoding fails.
async fn handler_metrics(State(registry): State<Arc<Registry>>) -> impl IntoResponse {
    let encoder = prometheus::TextEncoder::new();
    let metric_families = registry.gather();
    let mut buffer = vec![];
    // Collapse both failure modes (encoding and UTF-8 conversion) into a
    // single error path instead of duplicating the 500 response.
    let encoded = encoder
        .encode(&metric_families, &mut buffer)
        .ok()
        .and_then(|_| String::from_utf8(buffer).ok());
    match encoded {
        Some(metrics) => (StatusCode::OK, metrics).into_response(),
        None => (
            StatusCode::INTERNAL_SERVER_ERROR,
            "Failed to encode metrics",
        )
            .into_response(),
    }
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use super::metrics;
use super::ModelManager;
use derive_builder::Builder;
use tokio_util::sync::CancellationToken;
/// OpenAI-compatible HTTP service: a configured axum router plus the model
/// registry it serves. Cloning shares the same underlying model manager.
#[derive(Clone)]
pub struct HttpService {
    /// Registry of model engines exposed by the endpoints.
    models: ModelManager,
    /// Fully assembled axum router (metrics, model list, OAI endpoints).
    router: axum::Router,
    /// TCP port the service binds to (on 0.0.0.0).
    port: u16,
}
/// Configuration for [`HttpService`], constructed through the generated
/// [`HttpServiceConfigBuilder`] (the raw build fn is private; use
/// `HttpServiceConfigBuilder::build`).
#[derive(Clone, Builder)]
#[builder(build_fn(private, name = "build_internal"))]
pub struct HttpServiceConfig {
    /// Port to bind on 0.0.0.0; defaults to 8787.
    #[builder(default = "8787")]
    port: u16,
    // #[builder(default)]
    // custom: Vec<axum::Router>
    /// Serve the OpenAI chat-completions endpoints (default true).
    #[builder(default = "true")]
    enable_chat_endpoints: bool,
    /// Serve the OpenAI completions endpoints (default true).
    #[builder(default = "true")]
    enable_cmpl_endpoints: bool,
}
impl HttpService {
    /// Entry point for configuring a service; finish with `.build()`.
    pub fn builder() -> HttpServiceConfigBuilder {
        HttpServiceConfigBuilder::default()
    }

    /// The model registry backing this service.
    pub fn model_manager(&self) -> &ModelManager {
        &self.models
    }

    /// Serve HTTP until `cancel_token` fires, then shut down gracefully.
    ///
    /// # Errors
    /// Returns an error if the listen address cannot be bound (port in use,
    /// insufficient permissions) or if the server fails while running.
    pub async fn run(&self, cancel_token: CancellationToken) -> anyhow::Result<()> {
        let address = format!("0.0.0.0:{}", self.port);
        tracing::info!(address, "Starting HTTP service on: {address}");
        // Propagate bind failures instead of panicking: the caller declared
        // a Result and a bind error is recoverable (retry, different port).
        let listener = tokio::net::TcpListener::bind(address.as_str())
            .await
            .map_err(|err| anyhow::anyhow!("could not bind to address {address}: {err}"))?;
        let router = self.router.clone();
        let observer = cancel_token.child_token();
        Ok(axum::serve(listener, router)
            .with_graceful_shutdown(observer.cancelled_owned())
            .await
            // If the server dies on its own, cancel the token so siblings
            // observing it also shut down.
            .inspect_err(|_| cancel_token.cancel())?)
    }
}
impl HttpServiceConfigBuilder {
    /// Build the [`HttpService`]: wire up Prometheus metrics, the model
    /// listing route, and whichever OpenAI endpoints are enabled.
    pub fn build(self) -> Result<HttpService, anyhow::Error> {
        let config = self.build_internal()?;

        let model_manager = ModelManager::new();

        // enable prometheus metrics
        let registry = metrics::Registry::new();
        model_manager.metrics().register(&registry)?;

        let mut router = axum::Router::new();
        let mut all_docs = Vec::new();

        let mut routes = vec![
            metrics::router(registry, None),
            super::openai::list_models_router(model_manager.state(), None),
        ];

        // BUG FIX: the flags were swapped — `enable_chat_endpoints` gated the
        // completions router and `enable_cmpl_endpoints` gated the chat
        // router. Each flag now controls its own endpoint family.
        if config.enable_cmpl_endpoints {
            routes.push(super::openai::completions_router(
                model_manager.state(),
                None,
            ));
        }

        if config.enable_chat_endpoints {
            routes.push(super::openai::chat_completions_router(
                model_manager.state(),
                None,
            ));
        }

        for (route_docs, route) in routes.into_iter() {
            router = router.merge(route);
            all_docs.extend(route_docs);
        }
        // NOTE(review): `all_docs` is collected but never exposed; presumably
        // intended for a future docs/route-listing endpoint — confirm.

        Ok(HttpService {
            models: model_manager,
            router,
            port: config.port,
        })
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! # Triton LLM
//!
//! The `triton-llm` crate is a Rust library that provides a set of traits and types for building
//! distributed LLM inference solutions.
pub mod http;
pub mod protocols;
pub mod types;
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
/// Point-in-time engine utilization counters reported per forward pass.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ForwardPassMetrics {
    /// Request slots currently occupied.
    pub request_active_slots: u64,
    /// Total request slots available.
    pub request_total_slots: u64,
    /// KV blocks currently in use (presumably KV-cache blocks — confirm with producer).
    pub kv_active_blocks: u64,
    /// Total KV blocks available.
    pub kv_total_blocks: u64,
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment