feat: add initial audio/TTS pipeline support for vLLM-omni backend (#7495)

Signed-off-by: Zhongdongming Dai <zhongdongmin@nvidia.com>

feat: add initial audio/TTS pipeline support for vLLM-omni backend (#7495)
Signed-off-by: Zhongdongming Dai <zhongdongmin@nvidia.com>
cbbde3d0 · Zhongdongming Dai · GitHub · 76c70f41 · cbbde3d0 · cbbde3d0
Unverified Commit cbbde3d0 authored Mar 31, 2026 by Zhongdongming Dai Committed by GitHub Mar 31, 2026
7 changed files
--- a/lib/llm/src/protocols/openai/audios.rs
+++ b/lib/llm/src/protocols/openai/audios.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use dynamo_runtime::protocols::annotated::AnnotationsProvider;
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+
+mod aggregator;
+mod nvext;
+
+pub use aggregator::DeltaAggregator;
+pub use nvext::{NvExt, NvExtProvider};
+
+/// Request body for speech synthesis (the `/v1/audio/speech` endpoint).
+///
+/// Follows vLLM-Omni's OpenAICreateSpeechRequest format with TTS-specific
+/// parameters as top-level fields.
+///
+/// Only `input` is mandatory. Every other field is an `Option` that is left
+/// out of the serialized form when it is `None`.
+///
+/// NOTE(review): `Validate` is derived, but no field carries a
+/// `#[validate(...)]` attribute, so the ranges, defaults and allowed values
+/// quoted in the field docs are not enforced or applied by this type —
+/// confirm where they are handled.
+#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
+pub struct NvCreateAudioSpeechRequest {
+    /// The text to synthesize into speech (required)
+    pub input: String,
+
+    /// The TTS model to use
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+
+    /// Voice/speaker name (e.g., "vivian", "ryan", "aiden")
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<String>,
+
+    /// Output format: "wav", "mp3", "pcm", "flac", "aac", "opus"
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response_format: Option<String>,
+
+    /// Speed factor (documented range 0.25-4.0, documented default 1.0)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f64>,
+
+    // Qwen3-TTS specific parameters (top-level, matching vLLM-Omni)
+    /// TTS task type: "CustomVoice", "VoiceDesign", or "Base"
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub task_type: Option<String>,
+
+    /// Language: "Auto", "Chinese", "English", "Japanese", etc.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+
+    /// Voice style/emotion instructions (for VoiceDesign)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+
+    /// Reference audio URL or base64 (for voice cloning with Base task)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ref_audio: Option<String>,
+
+    /// Reference transcript (for voice cloning with Base task)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ref_text: Option<String>,
+
+    /// Maximum tokens to generate (documented default 2048)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_new_tokens: Option<i32>,
+
+    /// Optional user identifier
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub user: Option<String>,
+
+    /// NVIDIA extensions. The `AnnotationsProvider` impl for this request
+    /// reads its annotation list from `nvext.annotations`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub nvext: Option<NvExt>,
+}
+
+/// One generated audio item inside a speech response.
+///
+/// Both fields are optional and are omitted from the serialized form when
+/// they are `None`.
+// NOTE(review): the request's `response_format` docs list only audio codecs
+// ("wav", "mp3", ...), not "url" — confirm when `url` is actually populated.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct AudioData {
+    /// URL of the generated audio (if response_format is "url")
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub url: Option<String>,
+
+    /// Base64-encoded audio data
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub b64_json: Option<String>,
+}
+
+/// Response structure for audio speech generation.
+///
+/// When deserializing, `id`, `model` and `created` must be present; `object`,
+/// `status`, `progress` and `data` fall back to the defaults noted on each
+/// field. `error` and `inference_time_s` are skipped during serialization
+/// when they are `None`.
+///
+/// NOTE(review): `Validate` is derived but no field has a `#[validate(...)]`
+/// attribute, so e.g. the 0-100 range of `progress` is not enforced here.
+#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
+pub struct NvAudioSpeechResponse {
+    /// Unique identifier for the response
+    pub id: String,
+
+    /// Object type; defaults to "audio.speech" when absent from the input
+    #[serde(default = "default_object_type")]
+    pub object: String,
+
+    /// Model used for generation
+    pub model: String,
+
+    /// Status of the generation ("completed", "failed", etc.); defaults to
+    /// "completed" when absent from the input
+    #[serde(default = "default_status")]
+    pub status: String,
+
+    /// Progress percentage (0-100); defaults to 100 when absent from the input
+    #[serde(default = "default_progress")]
+    pub progress: i32,
+
+    /// Unix timestamp of creation
+    pub created: i64,
+
+    /// Generated audio data; defaults to an empty list when absent
+    #[serde(default)]
+    pub data: Vec<AudioData>,
+
+    /// Error message if generation failed
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<String>,
+
+    /// Inference time in seconds
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub inference_time_s: Option<f64>,
+}
+
+/// Serde fallback for `NvAudioSpeechResponse::object` when the field is absent.
+fn default_object_type() -> String {
+    String::from("audio.speech")
+}
+
+/// Serde fallback for `NvAudioSpeechResponse::status` when the field is absent.
+fn default_status() -> String {
+    String::from("completed")
+}
+
+/// Serde fallback for `NvAudioSpeechResponse::progress` when the field is absent.
+fn default_progress() -> i32 {
+    const FULL_PROGRESS: i32 = 100;
+    FULL_PROGRESS
+}
+
+impl NvAudioSpeechResponse {
+    /// Builds a placeholder response with no audio data.
+    ///
+    /// `id` and `model` are empty strings, `created` is `0`, and `object`,
+    /// `status` and `progress` take the same values serde uses when those
+    /// fields are missing from the input. `DeltaAggregator::apply` returns
+    /// this value when a stream finishes without yielding any payload.
+    pub fn empty() -> Self {
+        Self {
+            id: String::new(),
+            // Reuse the serde default helpers so the two sources of default
+            // values cannot drift apart.
+            object: default_object_type(),
+            model: String::new(),
+            status: default_status(),
+            progress: default_progress(),
+            created: 0,
+            data: Vec::new(),
+            error: None,
+            inference_time_s: None,
+        }
+    }
+}
+
+/// Implements `NvExtProvider` for `NvCreateAudioSpeechRequest`.
+impl NvExtProvider for NvCreateAudioSpeechRequest {
+    /// Borrows the request's `nvext` extension block, or returns `None` when
+    /// the request did not include one.
+    fn nvext(&self) -> Option<&NvExt> {
+        self.nvext.as_ref()
+    }
+}
+
+/// Implements `AnnotationsProvider` for `NvCreateAudioSpeechRequest`.
+///
+/// Annotations are read from the optional `nvext.annotations` list; a request
+/// without `nvext`, or with `nvext` but no list, has no annotations.
+impl AnnotationsProvider for NvCreateAudioSpeechRequest {
+    /// Returns an owned copy of the annotation list, or `None` when the
+    /// request carries none.
+    fn annotations(&self) -> Option<Vec<String>> {
+        self.nvext
+            .as_ref()
+            .and_then(|nvext| nvext.annotations.clone())
+    }
+
+    /// Reports whether `annotation` exactly matches one of the request's
+    /// annotations. Returns `false` when no annotation list is present.
+    fn has_annotation(&self, annotation: &str) -> bool {
+        // Compare borrowed strings directly instead of allocating a `String`
+        // from `annotation` on every call just to use `Vec::contains`.
+        self.nvext
+            .as_ref()
+            .and_then(|nvext| nvext.annotations.as_deref())
+            .is_some_and(|annotations| {
+                annotations
+                    .iter()
+                    .any(|candidate| candidate == annotation)
+            })
+    }
+}
--- a/lib/llm/src/protocols/openai/audios/aggregator.rs
+++ b/lib/llm/src/protocols/openai/audios/aggregator.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use futures::{Stream, StreamExt};
+
+use crate::types::Annotated;
+
+use super::NvAudioSpeechResponse;
+
+/// Aggregator for combining audio response deltas into a final response.
+///
+/// Holds the accumulator state used by [`DeltaAggregator::apply`].
+#[derive(Debug)]
+pub struct DeltaAggregator {
+    /// Response assembled so far; `None` until a delta carrying data arrives.
+    response: Option<NvAudioSpeechResponse>,
+    /// Most recent error reported by a stream item, if any.
+    error: Option<String>,
+}
+
+impl Default for DeltaAggregator {
+    /// Equivalent to [`DeltaAggregator::new`]: no response and no error.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DeltaAggregator {
+    /// Creates an aggregator with no response collected and no error recorded.
+    pub fn new() -> Self {
+        Self {
+            response: None,
+            error: None,
+        }
+    }
+
+    /// Collapses a stream of annotated audio responses into one response.
+    ///
+    /// The stream is always drained to completion. The first delta that
+    /// carries data becomes the base response; later deltas contribute only
+    /// their `data` entries, appended in arrival order. Once any item reports
+    /// an error, subsequent payloads are ignored.
+    ///
+    /// When the stream produces no payload at all, the result is
+    /// [`NvAudioSpeechResponse::empty`].
+    ///
+    /// # Errors
+    ///
+    /// Returns the error string of the last failing item seen in the stream.
+    pub async fn apply(
+        stream: impl Stream<Item = Annotated<NvAudioSpeechResponse>>,
+    ) -> Result<NvAudioSpeechResponse, String> {
+        // `StreamExt::next` needs an `Unpin` stream, so pin it to this frame.
+        let mut stream = std::pin::pin!(stream);
+        let mut state = DeltaAggregator::new();
+
+        while let Some(item) = stream.next().await {
+            match item.ok() {
+                // Keep draining; a later failure replaces an earlier one.
+                Err(message) => state.error = Some(message),
+                // After a failure, payloads are dropped without being merged.
+                Ok(_) if state.error.is_some() => {}
+                Ok(item) => {
+                    let Some(chunk) = item.data else { continue };
+                    match state.response.as_mut() {
+                        Some(merged) => merged.data.extend(chunk.data),
+                        None => state.response = Some(chunk),
+                    }
+                }
+            }
+        }
+
+        match state.error {
+            Some(message) => Err(message),
+            None => Ok(state
+                .response
+                .unwrap_or_else(NvAudioSpeechResponse::empty)),
+        }
+    }
+}
+
+impl NvAudioSpeechResponse {
+    /// Aggregates an annotated stream of audio responses into a final response.
+    ///
+    /// Convenience wrapper that forwards to [`DeltaAggregator::apply`] and
+    /// returns its result unchanged, including the `String` error.
+    pub async fn from_annotated_stream(
+        stream: impl Stream<Item = Annotated<NvAudioSpeechResponse>>,
+    ) -> Result<NvAudioSpeechResponse, String> {
+        DeltaAggregator::apply(stream).await
+    }
+}
--- a/lib/llm/src/protocols/openai/audios/nvext.rs
+++ b/lib/llm/src/protocols/openai/audios/nvext.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use derive_builder::Builder;
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+use validator::{Validate, ValidationError};
+
+/// Accessor trait for request types that can carry an [`NvExt`] block.
+pub trait NvExtProvider {
+    /// Returns the extension block if the request has one.
+    fn nvext(&self) -> Option<&NvExt>;
+}
+
+/// NVIDIA extensions to the Audio Speech API
+///
+/// Every field is optional and is omitted from the serialized form when it is
+/// `None`. The derived builder gives each field a default, and the optional
+/// setters take the inner value directly (`setter(strip_option)`).
+///
+/// NOTE(review): `language`, `task_type`, `max_new_tokens`, `ref_audio` and
+/// `ref_text` share names with top-level fields of
+/// `NvCreateAudioSpeechRequest`; which copy takes precedence is not decided
+/// in this file — confirm against the consumer.
+#[derive(ToSchema, Serialize, Deserialize, Builder, Validate, Debug, Clone)]
+#[validate(schema(function = "validate_nv_ext"))]
+pub struct NvExt {
+    /// Annotations for SSE stream events
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub annotations: Option<Vec<String>>,
+
+    /// Language: Auto, Chinese, English, Japanese, Korean, German, French, etc.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub language: Option<String>,
+
+    /// Task type: CustomVoice, VoiceDesign, or Base
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub task_type: Option<String>,
+
+    /// Maximum number of tokens to generate (documented default 2048; no
+    /// default is applied by this struct)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub max_new_tokens: Option<i32>,
+
+    /// Reference audio URL or base64 data (for voice cloning)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub ref_audio: Option<String>,
+
+    /// Reference transcript (for voice cloning)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub ref_text: Option<String>,
+
+    /// Random seed for reproducibility
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[builder(default, setter(strip_option))]
+    pub seed: Option<i64>,
+}
+
+impl Default for NvExt {
+    /// Returns an `NvExt` built from an untouched builder, i.e. with every
+    /// field at its builder default.
+    fn default() -> Self {
+        // Every field is marked `#[builder(default)]`, so `build()` has no
+        // required field that could be missing; state that invariant instead
+        // of using a bare `unwrap()`.
+        NvExt::builder()
+            .build()
+            .expect("NvExt builder cannot fail: all fields have builder defaults")
+    }
+}
+
+impl NvExt {
+    /// Returns a fresh `NvExtBuilder` with no fields set.
+    pub fn builder() -> NvExtBuilder {
+        NvExtBuilder::default()
+    }
+}
+
+/// Schema-level validation hook referenced by `#[validate(schema(...))]` on
+/// `NvExt`. It currently accepts every value without inspecting it.
+fn validate_nv_ext(_nv_ext: &NvExt) -> Result<(), ValidationError> {
+    Ok(())
+}
+
+impl NvExtBuilder {
+    /// Appends one annotation to the builder, creating the list on first use.
+    ///
+    /// Returns `&mut Self` so the call can be chained with other builder
+    /// methods.
+    pub fn add_annotation(&mut self, annotation: impl Into<String>) -> &mut Self {
+        // The builder stores this field as an `Option` wrapped around the
+        // field's own `Option<Vec<String>>`. Initialise whichever layer is
+        // still empty rather than panicking on an unset inner value.
+        self.annotations
+            .get_or_insert_with(|| Some(Vec::new()))
+            .get_or_insert_with(Vec::new)
+            .push(annotation.into());
+        self
+    }
+}
--- a/lib/llm/src/types.rs
+++ b/lib/llm/src/types.rs
@@ -96,6 +96,20 @@ pub mod openai {
        pub type OpenAIVideosStreamingEngine =
            ServerStreamingEngine<NvCreateVideoRequest, Annotated<NvVideosResponse>>;
    }
+
+    pub mod audios {
+        use super::*;
+
+        pub use protocols::openai::audios::{NvAudioSpeechResponse, NvCreateAudioSpeechRequest};
+
+        /// [`UnaryEngine`] alias for the Audio Speech API (request in, `NvAudioSpeechResponse` out)
+        pub type OpenAIAudiosUnaryEngine =
+            UnaryEngine<NvCreateAudioSpeechRequest, NvAudioSpeechResponse>;
+
+        /// [`ServerStreamingEngine`] alias for the Audio Speech API with `Annotated` response items
+        pub type OpenAIAudiosStreamingEngine =
+            ServerStreamingEngine<NvCreateAudioSpeechRequest, Annotated<NvAudioSpeechResponse>>;
+    }
 }

 pub mod generic {

--- a/lib/llm/tests/http-service.rs
+++ b/lib/llm/tests/http-service.rs
@@ -222,6 +222,7 @@ fn compute_index(endpoint: &Endpoint, request_type: &RequestType, status: &Statu
        Endpoint::Tensor => todo!(),
        Endpoint::Images => todo!(),
        Endpoint::Videos => todo!(),
+        Endpoint::Audios => todo!(),
    };

    let request_type = match request_type {

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,8 @@ vllm = [
    # not include vllm-omni — install it separately from source if needed.
    "vllm-omni==0.18.0",
    "blake3>=1.0.0,<2.0.0",
+    "soundfile>=0.13.1",
+    "librosa>=0.10.0",
 ]

 sglang = [
@@ -411,6 +413,10 @@ module = [
    "pydantic_core",
    "aiconfigurator",
    "aiconfigurator.*",
+    "soundfile",
+    "soundfile.*",
+    "librosa",
+    "librosa.*",
 ]
 ignore_missing_imports = true


--- a/tests/serve/test_vllm_omni.py
+++ b/tests/serve/test_vllm_omni.py
@@ -102,6 +102,40 @@ class I2VPayload(VideoGenerationPayload):
        self.body["input_reference"] = path


+@dataclass
+class AudioSpeechPayload(BasePayload):
+    """Payload for the /v1/audio/speech endpoint; checks binary or JSON replies."""
+
+    endpoint: str = "/v1/audio/speech"
+    timeout: int = 300
+
+    def response_handler(self, response: Any) -> str:
+        response.raise_for_status()
+        content_type = response.headers.get("content-type", "")
+        if "audio" in content_type:
+            # Binary audio body: require more than 100 bytes as a sanity check
+            audio_bytes = response.content
+            assert len(audio_bytes) > 100, (
+                f"Audio response too small ({len(audio_bytes)} bytes), "
+                f"likely not valid audio"
+            )
+            return f"binary_audio_{len(audio_bytes)}_bytes"
+        # JSON response: reject "failed", then expect a url or b64_json entry
+        result = response.json()
+        assert (
+            result.get("status") != "failed"
+        ), f"Audio generation failed: {result.get('error', 'unknown')}"
+        assert (
+            "data" in result
+        ), f"Missing 'data' in response. Keys: {list(result.keys())}"
+        assert len(result["data"]) > 0, "Empty data in audio response"
+        entry = result["data"][0]
+        if "url" in entry and entry["url"]:
+            return entry["url"]
+        assert entry.get("b64_json"), "Audio response b64_json is empty"
+        return "b64_audio_returned"
+
+
 @dataclass
 class VLLMOmniConfig(EngineConfig):
    """Configuration for vLLM-Omni test scenarios."""
@@ -205,6 +239,30 @@ vllm_omni_configs = {
            ),
        ],
    ),
+    "omni_audio": VLLMOmniConfig(
+        name="omni_audio",
+        directory=vllm_dir,
+        script_name="agg_omni_audio.sh",
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(1200),
+        ],
+        model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+        request_payloads=[
+            AudioSpeechPayload(
+                body={
+                    "input": "Hello, this is a test of Dynamo audio generation.",
+                    "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+                    "voice": "vivian",
+                    "language": "English",
+                },
+                repeat_count=1,
+                expected_response=[],
+                expected_log=[],
+            ),
+        ],
+    ),
    "omni_t2v": VLLMOmniConfig(
        name="omni_t2v",
        directory=vllm_dir,