Unverified Commit f63e273c authored by smatta-star's avatar smatta-star Committed by GitHub
Browse files

feat: add auto-generated frontend OpenAPI spec and helper binary (#4802)


Signed-off-by: default avatarSatvik Matta <smatta@nvidia.com>
parent ac8c9023
...@@ -2575,6 +2575,7 @@ dependencies = [ ...@@ -2575,6 +2575,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"url", "url",
"utoipa",
"uuid 1.18.1", "uuid 1.18.1",
] ]
...@@ -11781,6 +11782,8 @@ dependencies = [ ...@@ -11781,6 +11782,8 @@ dependencies = [
"quote", "quote",
"regex", "regex",
"syn 2.0.111", "syn 2.0.111",
"url",
"uuid 1.18.1",
] ]
[[package]] [[package]]
......
...@@ -181,6 +181,17 @@ Dynamo provides comprehensive benchmarking tools to evaluate and optimize your d ...@@ -181,6 +181,17 @@ Dynamo provides comprehensive benchmarking tools to evaluate and optimize your d
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf - **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
- **[SLA-Driven Dynamo Deployments](docs/planner/sla_planner_quickstart.md)** – Optimize your deployment to meet SLA requirements - **[SLA-Driven Dynamo Deployments](docs/planner/sla_planner_quickstart.md)** – Optimize your deployment to meet SLA requirements
## Frontend OpenAPI specification
The OpenAI-compatible HTTP frontend exposes an OpenAPI 3 specification at `/openapi.json`.
To generate and persist the same specification without running the server (for example for CI, documentation, or NIM integration), run:
```bash
cargo run -p dynamo-llm --bin generate-frontend-openapi
```
This writes the current frontend spec to `docs/frontends/openapi.json`, resolved relative to the directory the command is run from, so run it from the repository root.
# Engines # Engines
Dynamo is designed to be inference engine agnostic. To use any engine with Dynamo, NATS and etcd need to be installed, along with a Dynamo frontend (`python -m dynamo.frontend [--interactive]`). Dynamo is designed to be inference engine agnostic. To use any engine with Dynamo, NATS and etcd need to be installed, along with a Dynamo frontend (`python -m dynamo.frontend [--interactive]`).
......
This diff is collapsed.
...@@ -59,6 +59,7 @@ secrecy = { version = "0.10.3", features = ["serde"] } ...@@ -59,6 +59,7 @@ secrecy = { version = "0.10.3", features = ["serde"] }
bytes = "1.9.0" bytes = "1.9.0"
eventsource-stream = "0.2.3" eventsource-stream = "0.2.3"
tokio-tungstenite = { version = "0.26.1", optional = true, default-features = false } tokio-tungstenite = { version = "0.26.1", optional = true, default-features = false }
utoipa = { version = "5.3", features = ["url", "uuid"] }
[dev-dependencies] [dev-dependencies]
tokio-test = "0.4.4" tokio-test = "0.4.4"
......
This diff is collapsed.
...@@ -13,6 +13,7 @@ use std::{collections::HashMap, pin::Pin}; ...@@ -13,6 +13,7 @@ use std::{collections::HashMap, pin::Pin};
use derive_builder::Builder; use derive_builder::Builder;
use futures::Stream; use futures::Stream;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use crate::error::OpenAIError; use crate::error::OpenAIError;
...@@ -89,7 +90,7 @@ where ...@@ -89,7 +90,7 @@ where
deserializer.deserialize_option(StrictBoolVisitor) deserializer.deserialize_option(StrictBoolVisitor)
} }
#[derive(Clone, Serialize, Deserialize, Default, Debug, Builder, PartialEq)] #[derive(ToSchema, Clone, Serialize, Deserialize, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateCompletionRequestArgs")] #[builder(name = "CreateCompletionRequestArgs")]
#[builder(pattern = "mutable")] #[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)] #[builder(setter(into, strip_option), default)]
...@@ -197,7 +198,7 @@ pub struct CreateCompletionRequest { ...@@ -197,7 +198,7 @@ pub struct CreateCompletionRequest {
pub seed: Option<i64>, pub seed: Option<i64>,
} }
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)] #[derive(ToSchema, Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateCompletionResponse { pub struct CreateCompletionResponse {
/// A unique identifier for the completion. /// A unique identifier for the completion.
pub id: String, pub id: String,
......
...@@ -11,10 +11,11 @@ ...@@ -11,10 +11,11 @@
use base64::engine::{Engine, general_purpose}; use base64::engine::{Engine, general_purpose};
use derive_builder::Builder; use derive_builder::Builder;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use crate::error::OpenAIError; use crate::error::OpenAIError;
#[derive(Debug, Serialize, Clone, PartialEq, Deserialize)] #[derive(ToSchema, Debug, Serialize, Clone, PartialEq, Deserialize)]
#[serde(untagged)] #[serde(untagged)]
pub enum EmbeddingInput { pub enum EmbeddingInput {
String(String), String(String),
...@@ -24,7 +25,7 @@ pub enum EmbeddingInput { ...@@ -24,7 +25,7 @@ pub enum EmbeddingInput {
ArrayOfIntegerArray(Vec<Vec<u32>>), ArrayOfIntegerArray(Vec<Vec<u32>>),
} }
#[derive(Debug, Serialize, Default, Clone, PartialEq, Deserialize)] #[derive(ToSchema, Debug, Serialize, Default, Clone, PartialEq, Deserialize)]
#[serde(rename_all = "lowercase")] #[serde(rename_all = "lowercase")]
pub enum EncodingFormat { pub enum EncodingFormat {
#[default] #[default]
...@@ -32,7 +33,7 @@ pub enum EncodingFormat { ...@@ -32,7 +33,7 @@ pub enum EncodingFormat {
Base64, Base64,
} }
#[derive(Debug, Serialize, Default, Clone, Builder, PartialEq, Deserialize)] #[derive(ToSchema, Debug, Serialize, Default, Clone, Builder, PartialEq, Deserialize)]
#[builder(name = "CreateEmbeddingRequestArgs")] #[builder(name = "CreateEmbeddingRequestArgs")]
#[builder(pattern = "mutable")] #[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)] #[builder(setter(into, strip_option), default)]
...@@ -64,7 +65,7 @@ pub struct CreateEmbeddingRequest { ...@@ -64,7 +65,7 @@ pub struct CreateEmbeddingRequest {
} }
/// Represents an embedding vector returned by embedding endpoint. /// Represents an embedding vector returned by embedding endpoint.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] #[derive(ToSchema, Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Embedding { pub struct Embedding {
/// The index of the embedding in the list of embeddings. /// The index of the embedding in the list of embeddings.
pub index: u32, pub index: u32,
...@@ -75,7 +76,7 @@ pub struct Embedding { ...@@ -75,7 +76,7 @@ pub struct Embedding {
pub embedding: Vec<f32>, pub embedding: Vec<f32>,
} }
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] #[derive(ToSchema, Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Base64EmbeddingVector(pub String); pub struct Base64EmbeddingVector(pub String);
impl From<Base64EmbeddingVector> for Vec<f32> { impl From<Base64EmbeddingVector> for Vec<f32> {
...@@ -91,7 +92,7 @@ impl From<Base64EmbeddingVector> for Vec<f32> { ...@@ -91,7 +92,7 @@ impl From<Base64EmbeddingVector> for Vec<f32> {
} }
/// Represents a base64-encoded embedding vector returned by embedding endpoint. /// Represents a base64-encoded embedding vector returned by embedding endpoint.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] #[derive(ToSchema, Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Base64Embedding { pub struct Base64Embedding {
/// The index of the embedding in the list of embeddings. /// The index of the embedding in the list of embeddings.
pub index: u32, pub index: u32,
...@@ -101,7 +102,7 @@ pub struct Base64Embedding { ...@@ -101,7 +102,7 @@ pub struct Base64Embedding {
pub embedding: Base64EmbeddingVector, pub embedding: Base64EmbeddingVector,
} }
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] #[derive(ToSchema, Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EmbeddingUsage { pub struct EmbeddingUsage {
/// The number of tokens used by the prompt. /// The number of tokens used by the prompt.
pub prompt_tokens: u32, pub prompt_tokens: u32,
...@@ -109,7 +110,7 @@ pub struct EmbeddingUsage { ...@@ -109,7 +110,7 @@ pub struct EmbeddingUsage {
pub total_tokens: u32, pub total_tokens: u32,
} }
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)] #[derive(ToSchema, Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateEmbeddingResponse { pub struct CreateEmbeddingResponse {
pub object: String, pub object: String,
/// The name of the model used to generate the embedding. /// The name of the model used to generate the embedding.
...@@ -120,7 +121,7 @@ pub struct CreateEmbeddingResponse { ...@@ -120,7 +121,7 @@ pub struct CreateEmbeddingResponse {
pub usage: EmbeddingUsage, pub usage: EmbeddingUsage,
} }
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)] #[derive(ToSchema, Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateBase64EmbeddingResponse { pub struct CreateBase64EmbeddingResponse {
pub object: String, pub object: String,
/// The name of the model used to generate the embedding. /// The name of the model used to generate the embedding.
......
This diff is collapsed.
...@@ -195,6 +195,10 @@ insta = { version = "1.41", features = [ ...@@ -195,6 +195,10 @@ insta = { version = "1.41", features = [
lazy_static = "1.4" lazy_static = "1.4"
mockito = "1.7.0" mockito = "1.7.0"
[[bin]]
name = "generate-frontend-openapi"
path = "src/bin/generate_frontend_openapi.rs"
[build-dependencies] [build-dependencies]
tonic-build = { version = "0.13.1" } tonic-build = { version = "0.13.1" }
......
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Helper binary to generate the Dynamo HTTP frontend OpenAPI specification.
//!
//! This allows CI, documentation tooling, and NIM to obtain the exact same
//! OpenAPI document that is served at `/openapi.json` by the frontend
//! without having to start the HTTP service and scrape the endpoint.
//!
//! Usage (from the repository root):
//! ```bash
//! cargo run -p dynamo-llm --bin generate-frontend-openapi
//! ```
//! The generated spec will be written to:
//! `docs/frontends/openapi.json`
use std::fs;
use std::path::PathBuf;
use std::thread;
use anyhow::Context as _;
use dynamo_llm::http::service::{openapi_docs, service_v2::HttpService};
/// Number of bytes of stack handed to the generator thread (8 MiB).
///
/// Schema derivation through utoipa walks the deeply nested OpenAI request and
/// response types recursively, which needs more stack than a thread gets by
/// default.
const GENERATOR_STACK_SIZE: usize = 8 << 20;
/// Entry point: runs [`generate_openapi`] on a dedicated thread whose stack is
/// `GENERATOR_STACK_SIZE` bytes, then reports that thread's outcome.
///
/// # Errors
///
/// Returns an error when the thread cannot be spawned, when the thread
/// panics (the panic message is included if it is a string), or when
/// [`generate_openapi`] itself returns an error.
fn main() -> anyhow::Result<()> {
    // Spawn a thread with a larger stack to handle deeply nested schema generation.
    let handle = thread::Builder::new()
        .stack_size(GENERATOR_STACK_SIZE)
        .spawn(generate_openapi)
        .context("failed to spawn generator thread")?;

    match handle.join() {
        // The thread ran to completion; forward whatever it returned.
        Ok(result) => result,
        Err(payload) => {
            // The payload is a `Box<dyn Any + Send>`, whose `Debug` output is
            // just `Any { .. }`. Panics raised via `panic!` carry either a
            // `&'static str` or a `String`, so recover the text from those.
            let message = payload
                .downcast_ref::<&str>()
                .copied()
                .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
                .unwrap_or("<non-string panic payload>");
            Err(anyhow::anyhow!("generator thread panicked: {message}"))
        }
    }
}
/// Produces the frontend OpenAPI document and stores it as pretty-printed JSON.
///
/// The document is written to `docs/frontends/openapi.json`, a relative path
/// that is resolved against the current working directory; the directory is
/// created first if it is missing. A confirmation line with the output path
/// is printed to stdout on success.
///
/// # Errors
///
/// Returns an error if the `HttpService` cannot be built, if the output
/// directory cannot be created, if the spec cannot be serialized, or if the
/// file cannot be written.
fn generate_openapi() -> anyhow::Result<()> {
    // Turn on the chat, completion, embeddings and responses endpoint groups
    // so the resulting document describes the full user-facing surface.
    // Building the service only assembles the router and its route
    // documentation; no network listener is started here.
    let service = HttpService::builder()
        .enable_chat_endpoints(true)
        .enable_cmpl_endpoints(true)
        .enable_embeddings_endpoints(true)
        .enable_responses_endpoints(true)
        .build()
        .context("failed to build HttpService for OpenAPI generation")?;

    // Take an owned copy of the route documentation and turn it into a spec.
    let routes = service.route_docs().to_vec();
    let spec = openapi_docs::generate_openapi_spec(&routes);

    // Stable output location, relative to the repository root when the
    // binary is launched from there.
    let out_dir = PathBuf::from("docs/frontends");
    let out_path = out_dir.join("openapi.json");

    // Make sure the destination directory exists before serializing.
    fs::create_dir_all(&out_dir)
        .with_context(|| format!("failed to create OpenAPI output directory: {out_dir:?}"))?;

    let rendered =
        serde_json::to_string_pretty(&spec).context("failed to serialize OpenAPI spec")?;

    fs::write(&out_path, rendered)
        .with_context(|| format!("failed to write OpenAPI spec to: {out_path:?}"))?;

    println!(
        "Generated Dynamo frontend OpenAPI specification at {}",
        out_path.display()
    );

    Ok(())
}
This diff is collapsed.
This diff is collapsed.
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
use anyhow::Result; use anyhow::Result;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use super::common::EncodedMediaData; use super::common::EncodedMediaData;
use super::rdma::DecodedMediaData; use super::rdma::DecodedMediaData;
...@@ -34,7 +35,7 @@ pub trait Decoder: Clone + Send + 'static { ...@@ -34,7 +35,7 @@ pub trait Decoder: Clone + Send + 'static {
/// Media decoder configuration. /// Media decoder configuration.
/// Used both for MDC server config and runtime `media_io_kwargs`. /// Used both for MDC server config and runtime `media_io_kwargs`.
/// When used at runtime, limits are enforced from MDC and cannot be overridden. /// When used at runtime, limits are enforced from MDC and cannot be overridden.
#[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize)] #[derive(Clone, Debug, Default, serde::Serialize, serde::Deserialize, ToSchema)]
pub struct MediaDecoder { pub struct MediaDecoder {
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub image: Option<ImageDecoder>, pub image: Option<ImageDecoder>,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment