// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 use serde::{Deserialize, Serialize}; pub use super::FinishReason; pub use super::preprocessor::PreprocessedRequest; use crate::protocols::TokenIdType; use dynamo_async_openai::types::CompletionUsage; use dynamo_async_openai::types::StopReason; use dynamo_runtime::protocols::maybe_error::MaybeError; pub type TokenType = Option; pub type LogProbs = Vec; /// Output type discriminator for different modalities #[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq, Default)] #[serde(rename_all = "lowercase")] pub enum OutputType { #[default] Text, Image, Video, Audio, } /// Image URL data for responses #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct ImageUrlData { pub url: String, } /// Video URL data for responses #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct VideoUrlData { pub url: String, } /// Audio URL data for responses #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct AudioUrlData { pub url: String, } /// Content part for multimodal outputs (internal representation) #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] #[serde(tag = "type", rename_all = "snake_case")] pub enum ContentPart { Text { text: String }, ImageUrl { image_url: ImageUrlData }, VideoUrl { video_url: VideoUrlData }, AudioUrl { audio_url: AudioUrlData }, } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct TopLogprob { pub rank: u32, pub token_id: TokenIdType, pub token: TokenType, pub logprob: f64, } pub type TopLogprobs = Vec>; // num_tokens x top_logprobs #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct BackendOutput { /// New token_ids generated from the LLM Engine pub token_ids: Vec, /// Unlike [`LLMEngineOutput::tokens`], this is a vector of tokens, not an optional. /// The size of this vector should be the same as the size of `token_ids`. pub tokens: Vec, /// Decoded text from the list tokens. pub text: Option, /// Optional cumulative log probabilities pub cum_log_probs: Option, /// Optional log probabilities pub log_probs: Option, pub top_logprobs: Option, // TODO: Enrich this with more information as can apply our first-level postprocessing // logic and return more detailed information pub finish_reason: Option, /// The stop string or token that triggered the stop condition. /// This is set when finish_reason is Stop and identifies what triggered it. #[serde(default, skip_serializing_if = "Option::is_none")] pub stop_reason: Option, // Model Deployment Card checksum //pub mdcsum: String, // Index field for batch requests to match OpenAI format pub index: Option, // Token usage information #[serde(default, skip_serializing_if = "Option::is_none")] pub completion_usage: Option, /// Disaggregated execution parameters (for prefill/decode separation) #[serde(default, skip_serializing_if = "Option::is_none")] pub disaggregated_params: Option, } /// The LLM engine and backnd with manage it's own state, specifically translating how a /// given request/slot is managed on that particular backend. /// /// For nvLLM's purpose, it has a single tracable request_id as part of it's context that /// has propaged through the service pipeline to the backend. /// /// This is the minimal raw output from the LLM engine. The Backend may then apply multiple /// levels of post-processing before the BackendOutput is returns #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)] pub struct LLMEngineOutput { // new token_ids pub token_ids: Vec, /// If the LLM Engine performs the detokenization, then this will have a Some of the detokenized /// text/tokens. If this value is None, then the Backend is responsible for detokenization. pub tokens: Option>, // decoded text - pub text: Option, /// Output type discriminator (text, image, video, audio) #[serde(default)] pub output_type: OutputType, /// Multimodal content parts (for non-text outputs) #[serde(default, skip_serializing_if = "Option::is_none")] pub content_parts: Option>, /// cumulative log probabilities pub cum_log_probs: Option, /// Optional log probabilities pub log_probs: Option, pub top_logprobs: Option, // TODO: Enrich this with more information as can apply our first-level postprocessing // logic and return more detailed information pub finish_reason: Option, /// The stop string or token that triggered the stop condition. /// This is set when finish_reason is Stop and identifies what triggered it. #[serde(default, skip_serializing_if = "Option::is_none")] pub stop_reason: Option, // Index field for batch requests to match OpenAI format pub index: Option, /// Disaggregated execution parameters (for prefill/decode separation) #[serde(default, skip_serializing_if = "Option::is_none")] pub disaggregated_params: Option, /// Additional arguments for extensibility #[serde(default, skip_serializing_if = "Option::is_none")] pub extra_args: Option, // Token usage information #[serde(default, skip_serializing_if = "Option::is_none")] pub completion_usage: Option, } impl LLMEngineOutput { pub fn cancelled() -> Self { LLMEngineOutput { token_ids: vec![], tokens: None, text: None, output_type: OutputType::default(), content_parts: None, cum_log_probs: None, log_probs: None, top_logprobs: None, finish_reason: Some(FinishReason::Cancelled), stop_reason: None, index: None, disaggregated_params: None, extra_args: None, completion_usage: None, } } pub fn stop() -> Self { LLMEngineOutput { token_ids: vec![], tokens: None, text: None, output_type: OutputType::default(), content_parts: None, cum_log_probs: None, log_probs: None, finish_reason: Some(FinishReason::Stop), stop_reason: None, top_logprobs: None, index: None, disaggregated_params: None, extra_args: None, completion_usage: None, } } pub fn length() -> Self { LLMEngineOutput { token_ids: vec![], tokens: None, text: None, output_type: OutputType::default(), content_parts: None, cum_log_probs: None, log_probs: None, top_logprobs: None, finish_reason: Some(FinishReason::Length), stop_reason: None, index: None, disaggregated_params: None, extra_args: None, completion_usage: None, } } pub fn error(err_msg: String) -> Self { LLMEngineOutput { token_ids: vec![], tokens: None, text: None, output_type: OutputType::default(), content_parts: None, cum_log_probs: None, log_probs: None, top_logprobs: None, finish_reason: Some(FinishReason::Error(err_msg)), stop_reason: None, index: None, disaggregated_params: None, extra_args: None, completion_usage: None, } } } impl MaybeError for LLMEngineOutput { fn from_err(err: Box) -> Self { LLMEngineOutput::error(format!("{:?}", err)) } fn err(&self) -> Option { if let Some(FinishReason::Error(err_msg)) = &self.finish_reason { Some(anyhow::Error::msg(err_msg.clone())) } else { None } } } /// Raw output from embedding engines containing embedding vectors #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] pub struct EmbeddingsEngineOutput { /// Generated embedding vectors (one per input text) pub embeddings: Vec>, /// Token usage information pub prompt_tokens: u32, pub total_tokens: u32, } #[cfg(test)] mod tests { use super::*; #[test] fn test_maybe_error() { let output = LLMEngineOutput::stop(); assert!(output.err().is_none()); assert!(output.is_ok()); assert!(!output.is_err()); let output = LLMEngineOutput::error("Test error".to_string()); assert_eq!(format!("{}", output.err().unwrap()), "Test error"); assert!(!output.is_ok()); assert!(output.is_err()); } }