"launch/vscode:/vscode.git/clone" did not exist on "a8e5328ed86719292c40f20d68024c13a6f900d5"
Unverified Commit cbbde3d0 authored by Zhongdongming Dai's avatar Zhongdongming Dai Committed by GitHub
Browse files

feat: add initial audio/TTS pipeline support for vLLM-omni backend (#7495)


Signed-off-by: Zhongdongming Dai <zhongdongmin@nvidia.com>
parent 76c70f41
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use dynamo_runtime::protocols::annotated::AnnotationsProvider;
use serde::{Deserialize, Serialize};
use validator::Validate;
mod aggregator;
mod nvext;
pub use aggregator::DeltaAggregator;
pub use nvext::{NvExt, NvExtProvider};
/// Request body for audio speech generation (`/v1/audio/speech` endpoint).
///
/// Follows vLLM-Omni's OpenAICreateSpeechRequest format with TTS-specific
/// parameters as top-level fields.
///
/// `input` is the only mandatory field. Every other field is an `Option`
/// that is left out of the serialized form when it is `None`.
///
/// NOTE: the type derives `Validate`, but no field carries a `#[validate(...)]`
/// rule, so the value ranges and allowed names mentioned below are
/// documentation only and are not checked by this struct.
#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
pub struct NvCreateAudioSpeechRequest {
/// The text to synthesize into speech (required).
pub input: String,
/// The TTS model to use.
#[serde(skip_serializing_if = "Option::is_none")]
pub model: Option<String>,
/// Voice/speaker name (e.g., "vivian", "ryan", "aiden").
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
/// Output format: "wav", "mp3", "pcm", "flac", "aac", "opus".
/// Carried as a free-form string; the listed names are not enforced here.
#[serde(skip_serializing_if = "Option::is_none")]
pub response_format: Option<String>,
/// Speed factor (documented range 0.25-4.0, documented default 1.0).
/// Neither the range nor the default is applied by this struct.
#[serde(skip_serializing_if = "Option::is_none")]
pub speed: Option<f64>,
// Qwen3-TTS specific parameters (top-level, matching vLLM-Omni)
/// TTS task type: "CustomVoice", "VoiceDesign", or "Base".
#[serde(skip_serializing_if = "Option::is_none")]
pub task_type: Option<String>,
/// Language: "Auto", "Chinese", "English", "Japanese", etc.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Voice style/emotion instructions (for the VoiceDesign task).
#[serde(skip_serializing_if = "Option::is_none")]
pub instructions: Option<String>,
/// Reference audio URL or base64 (for voice cloning with the Base task).
#[serde(skip_serializing_if = "Option::is_none")]
pub ref_audio: Option<String>,
/// Reference transcript (for voice cloning with the Base task).
#[serde(skip_serializing_if = "Option::is_none")]
pub ref_text: Option<String>,
/// Maximum tokens to generate (documented default 2048; the default is
/// not applied by this struct).
#[serde(skip_serializing_if = "Option::is_none")]
pub max_new_tokens: Option<i32>,
/// Optional user identifier.
#[serde(skip_serializing_if = "Option::is_none")]
pub user: Option<String>,
/// NVIDIA extensions (reserved for future use). This is also where the
/// `NvExtProvider` and `AnnotationsProvider` impls for this type read from.
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<NvExt>,
}
/// One audio payload entry inside an audio speech response.
///
/// Both fields are optional and are omitted from the serialized form when
/// `None`; nothing in this type requires that exactly one of them is set.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct AudioData {
/// URL of the generated audio (if response_format is "url").
#[serde(skip_serializing_if = "Option::is_none")]
pub url: Option<String>,
/// Base64-encoded audio data.
#[serde(skip_serializing_if = "Option::is_none")]
pub b64_json: Option<String>,
}
/// Response structure for audio speech generation.
///
/// On deserialization `object`, `status`, `progress` and `data` fall back to
/// defaults when absent; `id`, `model` and `created` have no default and must
/// be present. `error` and `inference_time_s` are omitted from the serialized
/// form when `None`.
///
/// NOTE: the type derives `Validate`, but no field carries a validation rule,
/// so e.g. the 0-100 range of `progress` is not checked by this struct.
#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
pub struct NvAudioSpeechResponse {
/// Unique identifier for the response.
pub id: String,
/// Object type; defaults to "audio.speech" (see `default_object_type`).
#[serde(default = "default_object_type")]
pub object: String,
/// Model used for generation.
pub model: String,
/// Status of the generation ("completed", "failed", etc.); defaults to
/// "completed" (see `default_status`).
#[serde(default = "default_status")]
pub status: String,
/// Progress percentage (0-100); defaults to 100 (see `default_progress`).
#[serde(default = "default_progress")]
pub progress: i32,
/// Unix timestamp of creation.
pub created: i64,
/// Generated audio data; defaults to an empty list when absent.
#[serde(default)]
pub data: Vec<AudioData>,
/// Error message if generation failed.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
/// Inference time in seconds.
#[serde(skip_serializing_if = "Option::is_none")]
pub inference_time_s: Option<f64>,
}
/// Serde default for the `object` field of [`NvAudioSpeechResponse`].
fn default_object_type() -> String {
    "audio.speech".to_string()
}

/// Serde default for the `status` field of [`NvAudioSpeechResponse`].
fn default_status() -> String {
    "completed".to_string()
}

/// Serde default for the `progress` field of [`NvAudioSpeechResponse`].
fn default_progress() -> i32 {
    100
}

impl NvAudioSpeechResponse {
    /// Builds a placeholder response that carries no audio.
    ///
    /// `id` and `model` are empty strings, `created` is `0`, `data` is empty
    /// and the optional fields are `None`. `object`, `status` and `progress`
    /// come from the same helper functions serde uses for missing fields, so
    /// an "empty" response and a deserialized response with those fields
    /// omitted can never drift apart.
    pub fn empty() -> Self {
        Self {
            id: String::new(),
            object: default_object_type(),
            model: String::new(),
            status: default_status(),
            progress: default_progress(),
            created: 0,
            data: Vec::new(),
            error: None,
            inference_time_s: None,
        }
    }
}
/// Implements `NvExtProvider` for `NvCreateAudioSpeechRequest`.
///
/// Exposes the request's optional `nvext` field by reference.
impl NvExtProvider for NvCreateAudioSpeechRequest {
/// Returns a borrow of the NVIDIA extension block, or `None` when the
/// request did not carry one. Nothing is cloned.
fn nvext(&self) -> Option<&NvExt> {
self.nvext.as_ref()
}
}
/// Implements `AnnotationsProvider` for `NvCreateAudioSpeechRequest`.
///
/// Annotations live in the optional `nvext` block; a request without
/// `nvext`, or with `nvext.annotations` unset, has no annotations.
impl AnnotationsProvider for NvCreateAudioSpeechRequest {
    /// Returns an owned copy of the requested annotations, or `None` when
    /// either `nvext` or `nvext.annotations` is absent.
    fn annotations(&self) -> Option<Vec<String>> {
        self.nvext.as_ref()?.annotations.clone()
    }

    /// Reports whether `annotation` is among the requested annotations.
    ///
    /// Returns `false` when no annotations are present. The comparison is an
    /// exact string match and is done against the borrowed `&str`, so no
    /// temporary `String` is allocated per call.
    fn has_annotation(&self, annotation: &str) -> bool {
        self.nvext
            .as_ref()
            .and_then(|nvext| nvext.annotations.as_deref())
            .is_some_and(|annotations| annotations.iter().any(|a| a == annotation))
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use futures::{Stream, StreamExt};
use crate::types::Annotated;
use super::NvAudioSpeechResponse;
/// Aggregator for combining audio response deltas into a final response.
///
/// Used as the fold accumulator in `DeltaAggregator::apply`.
#[derive(Debug)]
pub struct DeltaAggregator {
// Response assembled so far; `None` until the first delta carrying data arrives.
response: Option<NvAudioSpeechResponse>,
// Error reported by the stream, if any; once set, further data is not merged.
error: Option<String>,
}
/// `Default` delegates to [`DeltaAggregator::new`], i.e. no response and no error.
impl Default for DeltaAggregator {
fn default() -> Self {
Self::new()
}
}
impl DeltaAggregator {
pub fn new() -> Self {
DeltaAggregator {
response: None,
error: None,
}
}
/// Aggregates a stream of annotated audio responses into a final response.
pub async fn apply(
stream: impl Stream<Item = Annotated<NvAudioSpeechResponse>>,
) -> Result<NvAudioSpeechResponse, String> {
let aggregator = stream
.fold(DeltaAggregator::new(), |mut aggregator, delta| async move {
let delta = match delta.ok() {
Ok(delta) => delta,
Err(error) => {
aggregator.error = Some(error);
return aggregator;
}
};
if aggregator.error.is_none()
&& let Some(response) = delta.data
{
match &mut aggregator.response {
Some(existing) => {
existing.data.extend(response.data);
}
None => {
aggregator.response = Some(response);
}
}
}
aggregator
})
.await;
if let Some(error) = aggregator.error {
return Err(error);
}
Ok(aggregator
.response
.unwrap_or_else(NvAudioSpeechResponse::empty))
}
}
impl NvAudioSpeechResponse {
/// Aggregates an annotated stream of audio responses into a final response.
///
/// Convenience wrapper that forwards `stream` to `DeltaAggregator::apply`
/// and returns its result unchanged, including the `Err(String)` case.
pub async fn from_annotated_stream(
stream: impl Stream<Item = Annotated<NvAudioSpeechResponse>>,
) -> Result<NvAudioSpeechResponse, String> {
DeltaAggregator::apply(stream).await
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use validator::{Validate, ValidationError};
/// Implemented by request types that may carry an [`NvExt`] extension block.
pub trait NvExtProvider {
/// Returns a borrow of the extension block, or `None` if there is none.
fn nvext(&self) -> Option<&NvExt>;
}
/// NVIDIA extensions to the Audio Speech API.
///
/// Every field is optional and omitted from the serialized form when `None`.
/// A builder (`NvExtBuilder`) is derived; each field is marked
/// `#[builder(default, setter(strip_option))]`, so fields may be left unset
/// when building.
///
/// Schema-level validation is routed to `validate_nv_ext`.
///
/// NOTE(review): `language`, `task_type`, `max_new_tokens`, `ref_audio` and
/// `ref_text` have same-named top-level fields on the audio speech request;
/// which one takes precedence is not decided in this file - confirm with the
/// consumer.
#[derive(ToSchema, Serialize, Deserialize, Builder, Validate, Debug, Clone)]
#[validate(schema(function = "validate_nv_ext"))]
pub struct NvExt {
/// Annotations for SSE stream events.
#[serde(default, skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub annotations: Option<Vec<String>>,
/// Language: Auto, Chinese, English, Japanese, Korean, German, French, etc.
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub language: Option<String>,
/// Task type: CustomVoice, VoiceDesign, or Base.
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub task_type: Option<String>,
/// Maximum number of tokens to generate (documented default 2048; the
/// default is not applied by this struct).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub max_new_tokens: Option<i32>,
/// Reference audio URL or base64 data (for voice cloning).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub ref_audio: Option<String>,
/// Reference transcript (for voice cloning).
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub ref_text: Option<String>,
/// Random seed for reproducibility.
#[serde(skip_serializing_if = "Option::is_none")]
#[builder(default, setter(strip_option))]
pub seed: Option<i64>,
}
impl Default for NvExt {
fn default() -> Self {
NvExt::builder().build().unwrap()
}
}
impl NvExt {
/// Returns a fresh `NvExtBuilder` (its `Default` value) for assembling an
/// `NvExt` field by field.
pub fn builder() -> NvExtBuilder {
NvExtBuilder::default()
}
}
/// Schema-level validation hook named by `#[validate(schema(function = "validate_nv_ext"))]`
/// on `NvExt`.
///
/// Currently a placeholder: the argument is ignored and every value is
/// accepted.
fn validate_nv_ext(_nv_ext: &NvExt) -> Result<(), ValidationError> {
Ok(())
}
impl NvExtBuilder {
    /// Appends one annotation to the builder, creating the list on first use.
    ///
    /// The builder stores `annotations` as a nested option (outer: "was the
    /// field set on the builder", inner: the field's own `Option`). Both
    /// layers are filled in on demand, so this never panics - including when
    /// the field was explicitly set to `None` beforehand.
    ///
    /// Returns `&mut Self` so calls can be chained with other setters.
    pub fn add_annotation(&mut self, annotation: impl Into<String>) -> &mut Self {
        self.annotations
            .get_or_insert_with(|| Some(Vec::new()))
            .get_or_insert_with(Vec::new)
            .push(annotation.into());
        self
    }
}
......@@ -96,6 +96,20 @@ pub mod openai {
pub type OpenAIVideosStreamingEngine =
ServerStreamingEngine<NvCreateVideoRequest, Annotated<NvVideosResponse>>;
}
/// Engine type aliases for the Audio Speech API, plus re-exports of the
/// request and response types they are parameterized with.
pub mod audios {
use super::*;
pub use protocols::openai::audios::{NvAudioSpeechResponse, NvCreateAudioSpeechRequest};
/// A [`UnaryEngine`] implementation for the Audio Speech API:
/// one `NvCreateAudioSpeechRequest` in, one `NvAudioSpeechResponse` out.
pub type OpenAIAudiosUnaryEngine =
UnaryEngine<NvCreateAudioSpeechRequest, NvAudioSpeechResponse>;
/// A [`ServerStreamingEngine`] implementation for the Audio Speech API:
/// one `NvCreateAudioSpeechRequest` in, `Annotated<NvAudioSpeechResponse>` items out.
pub type OpenAIAudiosStreamingEngine =
ServerStreamingEngine<NvCreateAudioSpeechRequest, Annotated<NvAudioSpeechResponse>>;
}
}
pub mod generic {
......
......@@ -222,6 +222,7 @@ fn compute_index(endpoint: &Endpoint, request_type: &RequestType, status: &Statu
Endpoint::Tensor => todo!(),
Endpoint::Images => todo!(),
Endpoint::Videos => todo!(),
Endpoint::Audios => todo!(),
};
let request_type = match request_type {
......
......@@ -56,6 +56,8 @@ vllm = [
# not include vllm-omni — install it separately from source if needed.
"vllm-omni==0.18.0",
"blake3>=1.0.0,<2.0.0",
"soundfile>=0.13.1",
"librosa>=0.10.0",
]
sglang = [
......@@ -411,6 +413,10 @@ module = [
"pydantic_core",
"aiconfigurator",
"aiconfigurator.*",
"soundfile",
"soundfile.*",
"librosa",
"librosa.*",
]
ignore_missing_imports = true
......
......@@ -102,6 +102,40 @@ class I2VPayload(VideoGenerationPayload):
self.body["input_reference"] = path
@dataclass
class AudioSpeechPayload(BasePayload):
    """Test payload aimed at the ``/v1/audio/speech`` endpoint."""

    endpoint: str = "/v1/audio/speech"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check an audio-speech HTTP response and return a short summary.

        A response whose content type mentions ``audio`` is treated as raw
        audio bytes and only sanity-checked for size. Anything else is parsed
        as JSON and must be a non-failed result with at least one ``data``
        entry that carries either a URL or base64 audio.

        Raises whatever ``response.raise_for_status()`` raises for HTTP
        errors, and ``AssertionError`` when a check on the body fails.
        """
        response.raise_for_status()

        if "audio" in response.headers.get("content-type", ""):
            # Raw audio body: anything this small cannot be real audio.
            size = len(response.content)
            assert size > 100, (
                f"Audio response too small ({size} bytes), "
                f"likely not valid audio"
            )
            return f"binary_audio_{size}_bytes"

        # Otherwise the body is JSON (an error report or a url/b64 result).
        payload = response.json()
        assert (
            payload.get("status") != "failed"
        ), f"Audio generation failed: {payload.get('error', 'unknown')}"
        assert (
            "data" in payload
        ), f"Missing 'data' in response. Keys: {list(payload.keys())}"

        items = payload["data"]
        assert len(items) > 0, "Empty data in audio response"

        # Only the first entry is inspected; a non-empty URL wins over b64.
        first = items[0]
        if "url" in first and first["url"]:
            return first["url"]

        assert first.get("b64_json"), "Audio response b64_json is empty"
        return "b64_audio_returned"
@dataclass
class VLLMOmniConfig(EngineConfig):
"""Configuration for vLLM-Omni test scenarios."""
......@@ -205,6 +239,30 @@ vllm_omni_configs = {
),
],
),
"omni_audio": VLLMOmniConfig(
name="omni_audio",
directory=vllm_dir,
script_name="agg_omni_audio.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(1200),
],
model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
request_payloads=[
AudioSpeechPayload(
body={
"input": "Hello, this is a test of Dynamo audio generation.",
"model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
"voice": "vivian",
"language": "English",
},
repeat_count=1,
expected_response=[],
expected_log=[],
),
],
),
"omni_t2v": VLLMOmniConfig(
name="omni_t2v",
directory=vllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment