Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pytest configuration and fixtures for vllm-omni tests.
"""
from typing import Any
import pytest
from vllm import TextPrompt
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.conftest import _run_post_test_cleanup, _run_pre_test_cleanup
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniSamplingParams
from vllm_omni.outputs import OmniRequestOutput
PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None
PromptImageInput = list[Any] | Any | None
PromptVideoInput = list[Any] | Any | None
class OmniRunner:
"""
Test runner for Omni models.
"""
def __init__(
self,
model_name: str,
seed: int = 42,
stage_init_timeout: int = 300,
batch_timeout: int = 10,
init_timeout: int = 300,
shm_threshold_bytes: int = 65536,
log_stats: bool = False,
stage_configs_path: str | None = None,
**kwargs,
) -> None:
"""
Initialize an OmniRunner for testing.
Args:
model_name: The model name or path
seed: Random seed for reproducibility
stage_init_timeout: Timeout for initializing a single stage in seconds
batch_timeout: Timeout for batching in seconds
init_timeout: Timeout for initializing stages in seconds
shm_threshold_bytes: Threshold for using shared memory
log_stats: Enable detailed statistics logging
stage_configs_path: Optional path to YAML stage config file
**kwargs: Additional arguments passed to Omni
"""
cleanup_dist_env_and_memory()
_run_pre_test_cleanup(enable_force=True)
_run_post_test_cleanup(enable_force=True)
self.model_name = model_name
self.seed = seed
self.omni = Omni(
model=model_name,
log_stats=log_stats,
stage_init_timeout=stage_init_timeout,
batch_timeout=batch_timeout,
init_timeout=init_timeout,
shm_threshold_bytes=shm_threshold_bytes,
stage_configs_path=stage_configs_path,
**kwargs,
)
def get_default_sampling_params_list(self) -> list[OmniSamplingParams]:
"""
Get a list of default sampling parameters for all stages.
Returns:
List of SamplingParams with default decoding for each stage
"""
return [st.default_sampling_params for st in self.omni.stage_list]
def get_omni_inputs(
self,
prompts: list[str] | str,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
images: PromptImageInput = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
modalities: list[str] | None = None,
) -> list[TextPrompt]:
"""
Construct Omni input format from prompts and multimodal data.
Args:
prompts: Text prompt(s) - either a single string or list of strings
system_prompt: Optional system prompt (defaults to Qwen system prompt)
audios: Audio input(s) - tuple of (audio_array, sample_rate) or list of tuples
images: Image input(s) - PIL Image or list of PIL Images
videos: Video input(s) - numpy array or list of numpy arrays
mm_processor_kwargs: Optional processor kwargs (e.g., use_audio_in_video)
modalities: Optional list of output modalities to request
Returns:
List of prompt dictionaries suitable for Omni.generate()
"""
if system_prompt is None:
system_prompt = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
video_padding_token = "<|VIDEO|>"
image_padding_token = "<|IMAGE|>"
audio_padding_token = "<|AUDIO|>"
if self.model_name == "Qwen/Qwen3-Omni-30B-A3B-Instruct":
video_padding_token = "<|video_pad|>"
image_padding_token = "<|image_pad|>"
audio_padding_token = "<|audio_pad|>"
if isinstance(prompts, str):
prompts = [prompts]
def _normalize_mm_input(mm_input, num_prompts):
if mm_input is None:
return [None] * num_prompts
if isinstance(mm_input, list):
if len(mm_input) != num_prompts:
raise ValueError(
f"Multimodal input list length ({len(mm_input)}) must match prompts length ({num_prompts})"
)
return mm_input
return [mm_input] * num_prompts
num_prompts = len(prompts)
audios_list = _normalize_mm_input(audios, num_prompts)
images_list = _normalize_mm_input(images, num_prompts)
videos_list = _normalize_mm_input(videos, num_prompts)
omni_inputs = []
for i, prompt_text in enumerate(prompts):
user_content = ""
multi_modal_data = {}
audio = audios_list[i]
if audio is not None:
if isinstance(audio, list):
for _ in audio:
user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
multi_modal_data["audio"] = audio
else:
user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
multi_modal_data["audio"] = audio
image = images_list[i]
if image is not None:
if isinstance(image, list):
for _ in image:
user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
multi_modal_data["image"] = image
else:
user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
multi_modal_data["image"] = image
video = videos_list[i]
if video is not None:
if isinstance(video, list):
for _ in video:
user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
multi_modal_data["video"] = video
else:
user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
multi_modal_data["video"] = video
user_content += prompt_text
full_prompt = (
f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
f"<|im_start|>user\n{user_content}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
input_dict: TextPrompt = {"prompt": full_prompt}
if multi_modal_data:
input_dict["multi_modal_data"] = multi_modal_data
if modalities:
input_dict["modalities"] = modalities
if mm_processor_kwargs:
input_dict["mm_processor_kwargs"] = mm_processor_kwargs
omni_inputs.append(input_dict)
return omni_inputs
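# Illustrative example (assumes the default, non-Qwen3 padding tokens): for a single
# audio input and the prompt "What is said?", get_omni_inputs builds a chat-formatted
# string of the form
#   <|im_start|>system\n{system_prompt}<|im_end|>\n
#   <|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>What is said?<|im_end|>\n
#   <|im_start|>assistant\n
# and attaches the raw audio tuple under multi_modal_data["audio"].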
def generate(
self,
prompts: list[TextPrompt],
sampling_params_list: list[OmniSamplingParams] | None = None,
) -> list[OmniRequestOutput]:
"""
Generate outputs for the given prompts.
Args:
prompts: List of prompt dictionaries with 'prompt' and optionally
'multi_modal_data' keys
sampling_params_list: List of sampling parameters for each stage.
If None, uses default parameters.
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
if sampling_params_list is None:
sampling_params_list = self.get_default_sampling_params_list()
return self.omni.generate(prompts, sampling_params_list)
def generate_multimodal(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
images: PromptImageInput = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
modalities: list[str] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with multimodal inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
audios: Audio input(s)
images: Image input(s)
videos: Video input(s)
mm_processor_kwargs: Optional processor kwargs
modalities: Optional list of output modalities to request
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
audios=audios,
images=images,
videos=videos,
mm_processor_kwargs=mm_processor_kwargs,
modalities=modalities,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_audio(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with audio inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
audios: Audio input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
audios=audios,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_video(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with video inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
videos: Video input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
videos=videos,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_image(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
images: PromptImageInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with image inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
images: Image input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
images=images,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - cleanup resources."""
self.close()
del self.omni
cleanup_dist_env_and_memory()
_run_post_test_cleanup(enable_force=True)
def close(self):
"""Close and cleanup the Omni instance."""
if hasattr(self.omni, "close"):
self.omni.close()
@pytest.fixture(scope="session")
def omni_runner():
return OmniRunner
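# Minimal usage sketch (hypothetical test, not part of this file): the session-scoped
# fixture yields the OmniRunner class itself, so tests instantiate it as a context
# manager; the model name, stage config path, and image variable below are placeholders.
#
#     def test_example(omni_runner):
#         with omni_runner("Qwen/Qwen2.5-Omni-3B", seed=42,
#                          stage_configs_path="stage_configs/qwen2_5_omni_ci.yaml") as runner:
#             outputs = runner.generate_multimodal(prompts="Describe the image.",
#                                                  images=some_pil_image)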
# stage config for running BAGEL with Mooncake connector for CI e2e tests.
# This config is optimized for single GPU tests with Mooncake inter-stage communication.
stage_args:
- stage_id: 0
stage_type: llm
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: mp
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_send_cache: true
kv_transfer_criteria:
type: prefill_finished
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 52
detokenize: true
repetition_penalty: 1.05
output_connectors:
to_stage_1: mooncake_connector
- stage_id: 1
stage_type: diffusion
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: dit
gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image
distributed_executor_backend: mp
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_recv_cache: true
engine_input_source: [0]
final_output: true
final_output_type: image
is_comprehension: false
default_sampling_params:
seed: 52
input_connectors:
from_stage_0: mooncake_connector
# Top-level runtime config with Mooncake connector
runtime:
enabled: true
defaults:
window_size: -1
max_inflight: 1
connectors:
mooncake_connector:
name: MooncakeConnector
extra:
host: "${MOONCAKE_HOST}"
metadata_server: "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata"
master: "${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}"
segment: 64000000
localbuf: 64000000
proto: tcp
edges:
- from: 0
to: 1
window_size: -1
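# Note: the ${MOONCAKE_HOST}, ${MOONCAKE_RPC_PORT}, and ${MOONCAKE_HTTP_PORT} placeholders
# are not resolved by the YAML loader; the Bagel Mooncake e2e test substitutes them with
# the host and ports of the locally started mooncake_master before passing the file to Omni.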
# stage config for running BAGEL with SharedMemory connector for CI e2e tests.
# This config is optimized for single GPU tests with SharedMemory inter-stage communication.
stage_args:
- stage_id: 0
stage_type: llm
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_send_cache: true
kv_transfer_criteria:
type: prefill_finished  # or when a special token is generated
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 52
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
stage_type: diffusion
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: dit
gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_recv_cache: true
engine_input_source: [0]
final_output: true
final_output_type: image
is_comprehension: false
default_sampling_params:
seed: 52
# Runtime edges
runtime:
enabled: true
defaults:
window_size: -1
max_inflight: 1
# Distributed connectors configuration (optional)
# More connectors will be supported in the future.
connectors:
shared_memory_connector:
name: SharedMemoryConnector
extra:
shm_threshold_bytes: 65536 # 64KB threshold
edges:
- from: 0
to: 1
window_size: -1
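# Note: this file is consumed by the Bagel shared-memory e2e test via
# Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=<path to this file>).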
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
max_num_batched_tokens: 4069
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
enforce_eager: false
trust_remote_code: true
engine_output_type: latent # Output hidden states for talker
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 1
load_format: dummy
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 100
seed: 42
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.6
enforce_eager: true
trust_remote_code: true
engine_output_type: latent # Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching: false
distributed_executor_backend: "mp"
hf_config_name: talker_config
load_format: dummy
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 100
seed: 42
detokenize: False
repetition_penalty: 1.05
stop_token_ids: [2150]
- stage_id: 2
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio # Final output: audio waveform
gpu_memory_utilization: 0.1
distributed_executor_backend: "mp"
max_num_batched_tokens: 1000000
hf_config_name: thinker_config
load_format: dummy
async_scheduling: false
engine_input_source: [1]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 200
seed: 42
detokenize: True
repetition_penalty: 1.1
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
enforce_eager: false
trust_remote_code: true
engine_output_type: latent # Output hidden states for talker
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 1
load_format: dummy
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 100
seed: 42
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.6
enforce_eager: true
trust_remote_code: true
engine_output_type: latent # Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching: false
distributed_executor_backend: "mp"
hf_config_name: talker_config
load_format: dummy
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 100
seed: 42
detokenize: False
repetition_penalty: 1.05
stop_token_ids: [2150]
- stage_id: 2
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio # Final output: audio waveform
gpu_memory_utilization: 0.1
distributed_executor_backend: "mp"
max_num_batched_tokens: 1000000
hf_config_name: thinker_config
load_format: dummy
async_scheduling: false
engine_input_source: [1]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 200
seed: 42
detokenize: True
repetition_penalty: 1.1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end test for Bagel text2img generation.
This test validates that the Bagel model generates images that match
expected reference pixel values within a ±5 tolerance.
Equivalent to running:
python3 examples/offline_inference/bagel/end2end.py \
--prompts "A futuristic city skyline at twilight, cyberpunk style" \
--modality text2img --step 15
"""
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
import signal
import socket
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any
import pytest
from PIL import Image
from tests.utils import hardware_test
from vllm_omni.entrypoints.omni import Omni
# Reference pixel data extracted from the known-good output image
# Each entry contains (x, y) position and expected (R, G, B) values
# "Generated with seed=52, num_inference_steps=15,
# prompt='A futuristic city skyline at twilight, cyberpunk style'"
REFERENCE_PIXELS = [
{"position": (100, 100), "rgb": (68, 107, 134)},
{"position": (400, 50), "rgb": (95, 139, 166)},
{"position": (700, 100), "rgb": (99, 122, 151)},
{"position": (150, 400), "rgb": (111, 125, 153)},
{"position": (512, 512), "rgb": (97, 107, 131)},
{"position": (700, 400), "rgb": (48, 64, 98)},
{"position": (100, 700), "rgb": (79, 63, 84)},
{"position": (400, 700), "rgb": (40, 58, 79)},
{"position": (700, 700), "rgb": (60, 75, 103)},
{"position": (256, 256), "rgb": (97, 128, 156)},
]
# Maximum allowed difference per color channel
PIXEL_TOLERANCE = 5
# Default test prompt
DEFAULT_PROMPT = "<|im_start|>A futuristic city skyline at twilight, cyberpunk style<|im_end|>"
def _find_free_port() -> int:
"""Find and return a free ephemeral port by binding to port 0."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
s.listen(1)
port = s.getsockname()[1]
return port
def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list:
"""Configure sampling parameters for Bagel text2img generation.
Args:
omni: The Omni instance to get default params from.
max_tokens: Maximum tokens for the first stage.
num_inference_steps: Number of inference steps for the diffusion stage.
Returns:
Configured sampling params list.
"""
params_list = omni.default_sampling_params_list
params_list[0].max_tokens = max_tokens # type: ignore
if len(params_list) > 1:
params_list[1].num_inference_steps = num_inference_steps # type: ignore
return params_list
def _extract_generated_image(omni_outputs: list) -> Image.Image | None:
"""Extract the generated image from Omni outputs.
Args:
omni_outputs: List of outputs from omni.generate().
Returns:
The first generated PIL Image, or None if no image found.
"""
for req_output in omni_outputs:
if images := getattr(req_output, "images", None):
return images[0]
if hasattr(req_output, "request_output") and req_output.request_output:
for stage_out in req_output.request_output:
if hasattr(stage_out, "images") and stage_out.images:
return stage_out.images[0]
return None
def _validate_pixels(
image: Image.Image,
reference_pixels: list[dict[str, Any]] = REFERENCE_PIXELS,
tolerance: int = PIXEL_TOLERANCE,
) -> None:
"""Validate that image pixels match expected reference values.
Args:
image: The PIL Image to validate.
reference_pixels: List of dicts with 'position' (x, y) and 'rgb' (R, G, B).
tolerance: Maximum allowed difference per color channel.
Raises:
AssertionError: If any pixel differs beyond tolerance.
"""
for ref in reference_pixels:
x, y = ref["position"]
expected = ref["rgb"]
actual = image.getpixel((x, y))[:3]
assert all(abs(a - e) <= tolerance for a, e in zip(actual, expected)), (
f"Pixel mismatch at ({x}, {y}): expected {expected}, got {actual}"
)
def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Image:
"""Generate an image using Bagel model with configured parameters.
Args:
omni: The Omni instance to use for generation.
prompt: The text prompt for image generation.
Returns:
The generated PIL Image.
Raises:
AssertionError: If no image is generated or size is incorrect.
"""
params_list = _configure_sampling_params(omni)
omni_outputs = list(
omni.generate(
prompts=[{"prompt": prompt, "modalities": ["image"]}],
sampling_params_list=params_list,
)
)
generated_image = _extract_generated_image(omni_outputs)
assert generated_image is not None, "No images generated"
assert generated_image.size == (1024, 1024), f"Expected 1024x1024, got {generated_image.size}"
return generated_image
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_shared_memory_connector():
"""Test Bagel text2img with shared memory connector."""
config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
try:
generated_image = _generate_bagel_image(omni)
_validate_pixels(generated_image)
finally:
omni.close()
def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool:
"""Wait for a port to become available.
Args:
host: The host address.
port: The port number.
timeout: Maximum seconds to wait.
Returns:
True if port becomes available, False otherwise.
"""
for _ in range(timeout):
try:
with socket.create_connection((host, port), timeout=1):
return True
except (TimeoutError, ConnectionRefusedError):
time.sleep(1)
return False
def _cleanup_mooncake_processes(timeout_secs: int = 5) -> None:
"""Clean up any existing mooncake_master processes.
Args:
timeout_secs: Maximum seconds to wait for graceful termination.
"""
subprocess.run(
["pkill", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
start_time = time.time()
while time.time() - start_time < timeout_secs:
result = subprocess.run(
["pgrep", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
if result.returncode != 0:
break
time.sleep(0.5)
else:
subprocess.run(
["pkill", "-9", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
time.sleep(1)
def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str:
"""Load Mooncake config from YAML and substitute placeholders.
Args:
host: Mooncake host address.
rpc_port: RPC port for Mooncake master.
http_port: HTTP metadata server port.
Returns:
Path to the temporary config file with substituted values.
"""
config_path = str(Path(__file__).parent / "stage_configs" / "bagel_mooncake_ci.yaml")
with open(config_path) as f:
config_content = f.read()
# Substitute placeholders
config_content = config_content.replace("${MOONCAKE_HOST}", host)
config_content = config_content.replace("${MOONCAKE_RPC_PORT}", str(rpc_port))
config_content = config_content.replace("${MOONCAKE_HTTP_PORT}", str(http_port))
# Write to temp file
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
temp_file.write(config_content)
temp_file.close()
return temp_file.name
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_mooncake_connector():
"""Test Bagel text2img with Mooncake connector for inter-stage communication."""
MOONCAKE_HOST = "127.0.0.1"
MOONCAKE_RPC_PORT = _find_free_port()
MOONCAKE_HTTP_PORT = _find_free_port()
MOONCAKE_METRICS_PORT = _find_free_port()
mooncake_master_proc = None
temp_config_file = None
omni = None
try:
_cleanup_mooncake_processes()
# Start mooncake_master
mooncake_master_proc = subprocess.Popen(
[
"mooncake_master",
f"--rpc_port={MOONCAKE_RPC_PORT}",
"--enable_http_metadata_server=true",
"--http_metadata_server_host=0.0.0.0",
f"--http_metadata_server_port={MOONCAKE_HTTP_PORT}",
f"--metrics_port={MOONCAKE_METRICS_PORT}",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
preexec_fn=os.setsid,
)
assert _wait_for_port(MOONCAKE_HOST, MOONCAKE_RPC_PORT), "mooncake_master failed to start"
# Create temp config and initialize Omni
temp_config_file = _load_mooncake_config(
host=MOONCAKE_HOST,
rpc_port=MOONCAKE_RPC_PORT,
http_port=MOONCAKE_HTTP_PORT,
)
omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300)
generated_image = _generate_bagel_image(omni)
_validate_pixels(generated_image)
finally:
if omni:
omni.close()
if temp_config_file:
try:
os.unlink(temp_config_file)
except OSError:
pass
if mooncake_master_proc:
try:
os.killpg(os.getpgid(mooncake_master_proc.pid), signal.SIGKILL)
except OSError:
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for cache-dit backend.
This test verifies that cache-dit acceleration works correctly with diffusion models.
It uses minimal settings to keep test time short for CI.
"""
import os
import sys
from pathlib import Path
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
from vllm_omni.outputs import OmniRequestOutput
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
# Use random weights model for testing
models = ["riverclouds/qwen_image_random"]
@pytest.mark.parametrize("model_name", models)
def test_cache_dit(model_name: str):
"""Test cache-dit backend with diffusion model."""
# Configure cache-dit with minimal settings for fast testing
cache_config = {
"Fn_compute_blocks": 1,
"Bn_compute_blocks": 0,
"max_warmup_steps": 2, # Minimal warmup for fast test
"residual_diff_threshold": 0.24,
"max_continuous_cached_steps": 3,
}
m = None
try:
m = Omni(
model=model_name,
cache_backend="cache_dit",
cache_config=cache_config,
)
# Use minimal settings for fast testing
height = 256
width = 256
num_inference_steps = 4 # Minimal steps for fast test
outputs = m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=num_inference_steps,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1, # Single output for speed
),
)
# Extract images from request_output[0]['images']
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
images = req_out.images
# Verify generation succeeded
assert images is not None
assert len(images) == 1
# Check image size
assert images[0].width == width
assert images[0].height == height
except Exception as e:
print(f"Test failed with error: {e}")
raise
finally:
if m is not None and hasattr(m, "close"):
m.close()
import sys
from pathlib import Path
import pytest
import torch
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.utils import GPUMemoryMonitor
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
models = ["riverclouds/qwen_image_random"]
def inference(model_name: str, offload: bool = True):
current_omni_platform.empty_cache()
device_index = torch.cuda.current_device()
monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
monitor.start()
m = Omni(model=model_name, enable_cpu_offload=offload)
torch.cuda.reset_peak_memory_stats(device=device_index)
height = 256
width = 256
m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=9,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
),
)
peak = monitor.peak_used_mb
monitor.stop()
return peak
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", models)
def test_cpu_offload_diffusion_model(model_name: str):
try:
no_offload_peak_memory = inference(model_name, offload=False)
cleanup_dist_env_and_memory()
offload_peak_memory = inference(model_name, offload=True)
except Exception as e:
pytest.fail(f"Inference failed: {e}")
print(f"Offload peak memory: {offload_peak_memory} MB")
print(f"No offload peak memory: {no_offload_peak_memory} MB")
assert offload_peak_memory + 2500 < no_offload_peak_memory, (
f"Offload peak memory {offload_peak_memory} MB should be less than no offload peak memory {no_offload_peak_memory} MB"
)
import sys
from pathlib import Path
import pytest
import torch
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.utils import GPUMemoryMonitor
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
# Models to test, mapped to the expected GPU memory savings in MB
MODELS_SAVED_MEMORY_MB = {"riverclouds/qwen_image_random": 4500}
def run_inference(
model_name: str,
layerwise_offload: bool = False,
num_gpu_layers: int = 1,
num_inference_steps: int = 3,
) -> float:
# For now, only CUDA GPUs are supported, so torch.cuda operations are applied here.
# NPU / ROCm platforms are detected by the skipif marker and this test is skipped there.
torch.cuda.empty_cache()
device_index = torch.cuda.current_device()
monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
monitor.start()
m = Omni(
model=model_name,
enable_layerwise_offload=layerwise_offload,
layerwise_num_gpu_layers=num_gpu_layers,
boundary_ratio=0.875,
flow_shift=5.0,
)
torch.cuda.reset_peak_memory_stats(device=device_index)
# Refer to tests/e2e/offline_inference/test_t2v_model.py
# Use minimal settings for testing
height = 480
width = 640
num_frames = 5
m.generate(
"A cat sitting on a table",
OmniDiffusionSamplingParams(
height=height,
width=width,
generator=torch.Generator("cuda").manual_seed(42),
guidance_scale=1.0,
num_inference_steps=num_inference_steps,
num_frames=num_frames,
),
)
peak = monitor.peak_used_mb
monitor.stop()
return peak
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", MODELS_SAVED_MEMORY_MB.keys())
def test_layerwise_offload_diffusion_model(model_name: str):
"""Test that layerwise offloading reduces GPU memory usage.
This test verifies that layerwise offloading significantly reduces peak
GPU memory usage compared to loading the entire model on GPU. The layerwise
offloader keeps only a single transformer block on GPU at a time, with
prefetching for compute-memory overlap.
"""
try:
# Run without layerwise offloading (baseline)
no_offload_peak_memory = run_inference(model_name, layerwise_offload=False)
cleanup_dist_env_and_memory()
# Run with layerwise offloading (1 layer on device)
layerwise_offload_peak_memory = run_inference(model_name, layerwise_offload=True, num_gpu_layers=1)
cleanup_dist_env_and_memory()
# Run with 2 layers on device
layerwise_offload_two_layers_peak = run_inference(model_name, layerwise_offload=True, num_gpu_layers=2)
except Exception as e:
pytest.fail(f"Inference failed: {e}")
print(f"Layerwise offload peak memory (1 GPU layer): {layerwise_offload_peak_memory} MB")
print(f"Layerwise offload peak memory (2 GPU layers): {layerwise_offload_two_layers_peak} MB")
print(f"No offload peak memory: {no_offload_peak_memory} MB")
# Verify that layerwise offloading significantly reduces memory usage
# Passes only if the actual savings exceeds the expected savings
assert layerwise_offload_peak_memory + MODELS_SAVED_MEMORY_MB[model_name] < no_offload_peak_memory, (
f"Layerwise offload peak memory {layerwise_offload_peak_memory} MB "
f"should be significantly less than no offload peak memory {no_offload_peak_memory} MB"
)
# Verify that 2 GPU layers use more memory than 1 GPU layer
# But not excessively more (should be a reasonable increase)
assert layerwise_offload_peak_memory < layerwise_offload_two_layers_peak, (
f"1 GPU layer peak {layerwise_offload_peak_memory} MB should be < "
f"2 GPU layers peak {layerwise_offload_two_layers_peak} MB"
)
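# Conceptual sketch (illustration only, not the vllm_omni implementation; names are
# hypothetical): with layerwise offloading, block weights stay on CPU and each block is
# copied to the GPU only while it runs, so peak usage is roughly one block plus activations.
#
#     for block in transformer_blocks:            # all blocks resident on CPU
#         block.to("cuda")                        # stage this block's weights onto the GPU
#         hidden_states = block(hidden_states)    # compute (prefetching of the next block omitted)
#         block.to("cpu")                         # evict so only one block stays resident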
import json
import os
import sys
from pathlib import Path
import pytest
import torch
from safetensors.torch import save_file
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
# This test is specific to Z-Image LoRA behavior. Keep it focused on a single
# model to reduce runtime and avoid extra downloads.
models = ["Tongyi-MAI/Z-Image-Turbo"]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str, tmp_path: Path):
def _extract_images(outputs: list[OmniRequestOutput]):
if not outputs:
raise ValueError("Empty outputs from Omni.generate()")
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
return req_out.images
def _write_zimage_lora(adapter_dir: Path) -> str:
adapter_dir.mkdir(parents=True, exist_ok=True)
# Z-Image transformer uses dim=3840 by default (see ZImageTransformer2DModel).
dim = 3840
module_name = "transformer.layers.0.attention.to_qkv"
rank = 1
lora_a = torch.zeros((rank, dim), dtype=torch.float32)
lora_a[0, 0] = 1.0
# QKVParallelLinear packs (Q, K, V). With tp=1 and n_kv_heads==n_heads in Z-Image,
# each slice is `dim`, so total out dim is `3 * dim`.
lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32)
# Apply a visible delta to the Q slice only to keep the perturbation bounded.
lora_b[:dim, 0] = 0.1
save_file(
{
f"base_model.model.{module_name}.lora_A.weight": lora_a,
f"base_model.model.{module_name}.lora_B.weight": lora_b,
},
str(adapter_dir / "adapter_model.safetensors"),
)
(adapter_dir / "adapter_config.json").write_text(
json.dumps(
{
"r": rank,
"lora_alpha": rank,
"target_modules": [module_name],
}
),
encoding="utf-8",
)
return str(adapter_dir)
m = Omni(model=model_name)
try:
# high resolution may cause OOM on L4
height = 256
width = 256
prompt = "a photo of a cat sitting on a laptop keyboard"
outputs = m.generate(
prompt,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
),
)
images = _extract_images(outputs)
assert len(images) == 1
# check image size
assert images[0].width == width
assert images[0].height == height
# Real LoRA E2E: generate again with a real on-disk PEFT adapter and
# verify that output changes.
if model_name == "Tongyi-MAI/Z-Image-Turbo":
from vllm_omni.lora.request import LoRARequest
from vllm_omni.lora.utils import stable_lora_int_id
lora_dir = _write_zimage_lora(tmp_path / "zimage_lora")
lora_request = LoRARequest(
lora_name="test",
lora_int_id=stable_lora_int_id(lora_dir),
lora_path=lora_dir,
)
outputs_lora = m.generate(
prompt,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
lora_request=lora_request,
lora_scale=2.0,
),
)
images_lora = _extract_images(outputs_lora)
assert len(images_lora) == 1
assert images_lora[0].width == width
assert images_lora[0].height == height
import numpy as np
diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean()
assert diff > 0.0
finally:
m.close()
"""
Tests for Ovis Image model pipeline.
Strategy:
1. `mock_dependencies` fixture mocks heavy external components (VAE, Scheduler, TextEncoder)
to allow fast testing of the pipeline logic without downloading weights.
- Mocks are configured to return tensors on the correct device.
- Transformer is mocked dynamically to return random noise of correct shape.
2. `test_real_transformer_init_and_forward` tests the actual `OvisImageTransformer2DModel`
initialization and forward pass with a small configuration to ensure code coverage
and correctness of the model definition itself, independent of the pipeline mocks.
"""
from unittest.mock import MagicMock, patch
import pytest
import torch
from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig
# The real OvisImageTransformer2DModel is not lightweight, so these pipeline tests mock
# the transformer forward to return random noise instead of instantiating it.
from vllm_omni.diffusion.distributed.utils import get_local_device
from vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image import OvisImagePipeline
from vllm_omni.diffusion.request import OmniDiffusionRequest
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
@pytest.fixture
def mock_dependencies(monkeypatch):
"""
Mock external dependencies to avoid loading real models.
"""
device = get_local_device()
# Mock Tokenizer
mock_tokenizer = MagicMock()
mock_tokenizer.return_value = MagicMock(
input_ids=torch.zeros((1, 50), dtype=torch.long, device=device),
attention_mask=torch.ones((1, 50), dtype=torch.long, device=device),
)
mock_tokenizer.apply_chat_template.return_value = "dummy prompt"
mock_tokenizer.model_max_length = 1024
# Mock Text Encoder
mock_text_encoder = MagicMock()
mock_text_encoder.dtype = torch.float32
# Output of text encoder must be on the same device as inputs (which are moved to execution_device)
mock_text_encoder.return_value.last_hidden_state = torch.randn(1, 50, 32, device=device)
# Mock VAE
mock_vae = MagicMock()
mock_vae.config.block_out_channels = [128, 256, 512, 512] # Scale factor 8
mock_vae.config.scale_factor_temporal = 1
mock_vae.config.scale_factor_spatial = 8
mock_vae.config.scaling_factor = 0.18215
mock_vae.config.shift_factor = 0.0
# Decode return value
mock_vae.decode.return_value = [torch.randn(1, 3, 128, 128, device=device)]
# Ensure .to() returns self so configuration persists
mock_vae.to.return_value = mock_vae
# Mock Scheduler
mock_scheduler = MagicMock()
mock_scheduler.config = MagicMock()
# Timesteps on device to match latents during denoising loop interaction if needed
mock_scheduler.timesteps = torch.tensor([1.0, 0.5, 0.0], device=device)
mock_scheduler.set_timesteps.return_value = None
# Make step return dynamic based on input sample shape
def mock_scheduler_step(model_output, timestep, sample, **kwargs):
# sample is the latents, should be preserved
return (torch.randn_like(sample),)
mock_scheduler.step.side_effect = mock_scheduler_step
module_path = "vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image"
monkeypatch.setattr(f"{module_path}.Qwen2TokenizerFast.from_pretrained", lambda *a, **k: mock_tokenizer)
monkeypatch.setattr(f"{module_path}.Qwen3Model.from_pretrained", lambda *a, **k: mock_text_encoder)
monkeypatch.setattr(f"{module_path}.AutoencoderKL.from_pretrained", lambda *a, **k: mock_vae)
monkeypatch.setattr(
f"{module_path}.FlowMatchEulerDiscreteScheduler.from_pretrained", lambda *a, **k: mock_scheduler
)
return {
"tokenizer": mock_tokenizer,
"text_encoder": mock_text_encoder,
"vae": mock_vae,
"scheduler": mock_scheduler,
"device": device,
}
@pytest.fixture
def ovis_pipeline(mock_dependencies, monkeypatch):
"""
Creates an OvisImagePipeline instance with mocked components.
"""
# Create config
tf_config = TransformerConfig(
params={
"in_channels": 4,
"out_channels": 4,
"sample_size": 32,
"patch_size": 2,
"num_attention_heads": 4,
"attention_head_dim": 8,
"num_layers": 1,
"caption_channels": 32,
}
)
od_config = OmniDiffusionConfig(
model="dummy-ovis",
tf_model_config=tf_config,
dtype=torch.float32,
num_gpus=1,
)
# Mock Transformer Layer separately to avoid full init
# We patch OvisImageTransformer2DModel class in the module
mock_transformer_cls = MagicMock()
mock_transformer_instance = MagicMock()
mock_transformer_instance.dtype = torch.float32
mock_transformer_instance.in_channels = 16 # Must be 16 so num_channel_latents=4, packed=16
# Forward return: noise prediction
def mock_forward(hidden_states, *args, **kwargs):
# hidden_states shape: (B, SeqLen, Channels)
return (torch.randn_like(hidden_states),)
mock_transformer_instance.forward.side_effect = mock_forward
# Also make the instance itself callable to mimic __call__
mock_transformer_instance.side_effect = mock_forward
mock_transformer_cls.return_value = mock_transformer_instance
monkeypatch.setattr(
"vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image.OvisImageTransformer2DModel", mock_transformer_cls
)
# Initialize pipeline
# We use a dummy model path check override
with patch("os.path.exists", return_value=True):
pipeline = OvisImagePipeline(od_config=od_config)
return pipeline
def test_interface_compliance(ovis_pipeline):
"""Verify methods required by vllm-omni framework."""
assert hasattr(ovis_pipeline, "load_weights")
assert hasattr(ovis_pipeline, "scheduler")
assert hasattr(ovis_pipeline, "transformer")
assert hasattr(ovis_pipeline, "text_encoder")
# assert hasattr(ovis_pipeline, "vae") # Ovis uses VAE
def test_basic_generation(ovis_pipeline):
"""Test the forward pass logic."""
# Setup request
req = OmniDiffusionRequest(
prompts=["A photo of a cat"],
sampling_params=OmniDiffusionSamplingParams(
height=256,
width=256,
num_inference_steps=2,
guidance_scale=1.0,
),
)
output = ovis_pipeline(req)
assert output is not None
assert output.output is not None
# Output should be a tensor from mocked VAE decode [torch.randn(1, 3, 128, 128)]
assert isinstance(output.output, torch.Tensor)
assert output.output.shape == (1, 3, 128, 128)
# Check that transformer was called
assert ovis_pipeline.transformer.call_count > 0
def test_guidance_scale(ovis_pipeline):
"""Test that classifier-free guidance path is taken when scale > 1.0."""
req = OmniDiffusionRequest(
prompts=[
{
"prompt": "A photo of a cat",
"negative_prompt": "bad quality",
}
],
sampling_params=OmniDiffusionSamplingParams(
height=256,
width=256,
num_inference_steps=1,
guidance_scale=2.0, # Trigger CFG
),
)
ovis_pipeline(req)
assert ovis_pipeline.transformer.call_count >= 2
def test_resolution_check(ovis_pipeline):
"""Test resolution divisible validation logic if present."""
# Pass odd resolution
req = OmniDiffusionRequest(
prompts=["test"],
sampling_params=OmniDiffusionSamplingParams(
height=250, # Not divisible by 16 (8*2)
width=250,
),
)
# The pipeline logs a warning for non-divisible resolutions but proceeds with generation.
output = ovis_pipeline(req)
assert output is not None
def test_real_transformer_init_and_forward():
"""Test the real OvisImageTransformer2DModel initialization and forward pass for coverage."""
from unittest.mock import patch
from vllm_omni.diffusion.models.ovis_image.ovis_image_transformer import OvisImageTransformer2DModel
device = get_local_device()
tf_config = TransformerConfig(
params={
"patch_size": 2,
"in_channels": 16,
"out_channels": 16,
"num_layers": 1,
"num_single_layers": 1,
"attention_head_dim": 8,
"num_attention_heads": 2,
"joint_attention_dim": 32,
"axes_dims_rope": (4, 4, 4),
}
)
od_config = OmniDiffusionConfig(model="dummy-ovis", tf_model_config=tf_config, dtype=torch.bfloat16, num_gpus=1)
torch.set_default_dtype(torch.bfloat16)
# Mock distributed state for QKVParallelLinear initialization
# We patch get_tp_group because get_tensor_model_parallel_rank calls it and asserts _TP is not None
mock_group = MagicMock()
mock_group.rank_in_group = 0
mock_group.world_size = 1
with patch("vllm.distributed.parallel_state.get_tp_group", return_value=mock_group):
# Initialize real model
model = OvisImageTransformer2DModel(
od_config=od_config,
patch_size=1,
in_channels=16,
out_channels=16,
num_single_layers=1,
attention_head_dim=8,
num_attention_heads=2,
joint_attention_dim=32,
axes_dims_rope=(2, 2, 4),
).to(device)
# Create dummy inputs
B, Seq, C = 1, 16, 16
hidden_states = torch.randn(B, Seq, C, device=device)
encoder_hidden_states = torch.randn(B, 10, 32, device=device) # joint_attention_dim=32
timestep = torch.tensor([1], device=device)
img_ids = torch.zeros(Seq, 3, device=device)
txt_ids = torch.zeros(10, 3, device=device)
# Run forward
output = model(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
timestep=timestep,
img_ids=img_ids,
txt_ids=txt_ids,
return_dict=False,
)
assert output is not None
assert isinstance(output, tuple)
assert output[0].shape == hidden_states.shape
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E tests for Qwen2.5-Omni model with mixed modality inputs and audio output.
"""
from pathlib import Path
import pytest
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.multimodal.image import convert_image_mode
from vllm_omni.platforms import current_omni_platform
from .conftest import OmniRunner
from .utils import create_new_process_for_each_test
models = ["Qwen/Qwen2.5-Omni-3B"]
# CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU
if current_omni_platform.is_npu():
stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml")
elif current_omni_platform.is_rocm():
# ROCm stage config optimized for MI325 GPU
stage_config = str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")
else:
stage_config = str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")
# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models]
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
# Prepare multimodal inputs
question = "What is recited in the audio? What is in this image? Describe the video briefly."
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
audio = (audio[0][: 16000 * 5], audio[1]) # Trim to first 5 seconds
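# 16000 * 5 assumes a 16 kHz sample rate, i.e. 5 s == 80000 samples.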
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB")
if not VLLM_USE_MODELSCOPE:
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
else:
# modelscope can't access raushan-testing-hf/videos-test, skip video input temporarily
video = None
outputs = runner.generate_multimodal(
prompts=question,
audios=audio,
images=image,
videos=video,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# Find and verify audio output (code2wav stage)
audio_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "audio":
audio_output = stage_output
output_count += 1
break
assert output_count > 0
assert audio_output is not None
assert len(audio_output.request_output) > 0
# Verify audio tensor exists and has content
audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
assert audio_tensor is not None
assert audio_tensor.numel() > 0
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
# Prepare multimodal inputs
question = "What is recited in the audio? What is in this image? Describe the video briefly."
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
audio = (audio[0][: 16000 * 5], audio[1]) # Trim to first 5 seconds
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB")
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
modalities = ["text"]
outputs = runner.generate_multimodal(
prompts=question,
audios=audio,
images=image,
videos=video,
modalities=modalities,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
assert stage_output.final_output_type != "audio"
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E offline tests for Omni model with video input and audio output.
"""
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
from pathlib import Path
import pytest
from vllm.assets.video import VideoAsset
from vllm_omni.platforms import current_omni_platform
from .conftest import OmniRunner
models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
# CI stage config for 2xH100-80G GPUs or AMD GPU MI325
if current_omni_platform.is_rocm():
# ROCm stage config optimized for MI325 GPU
stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
else:
stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml")]
# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
@pytest.mark.parametrize("test_config", test_params)
def test_video_to_audio(omni_runner: type[OmniRunner], test_config) -> None:
"""Test processing video, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path, stage_init_timeout=300) as runner:
# Prepare inputs
question = "Describe the video briefly."
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
outputs = runner.generate_multimodal(
prompts=question,
videos=video,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# Find and verify audio output (code2wav stage)
audio_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "audio":
audio_output = stage_output
output_count += 1
break
assert output_count > 0
assert audio_output is not None
assert len(audio_output.request_output) > 0
# Verify audio tensor exists and has content
audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
assert audio_tensor is not None
assert audio_tensor.numel() > 0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for Sequence Parallel (SP) backends: Ulysses and Ring attention.
Tests verify that SP inference produces correct outputs compared to baseline.
"""
import gc
import os
import sys
import time
from pathlib import Path
from typing import NamedTuple
import numpy as np
import pytest
import torch
import torch.distributed as dist
from PIL import Image
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[3]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
from vllm_omni.diffusion.data import DiffusionParallelConfig
from vllm_omni.platforms import current_omni_platform
# Test configuration
MODELS = ["riverclouds/qwen_image_random"]
PROMPT = "a photo of a cat sitting on a laptop keyboard"
DEFAULT_HEIGHT = 256
DEFAULT_WIDTH = 256
DEFAULT_SEED = 42
DEFAULT_STEPS = 4
DIFF_MEAN_THRESHOLD = 2e-2
DIFF_MAX_THRESHOLD = 2e-1
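# In 8-bit pixel terms these thresholds allow roughly 5/255 per channel on
# average (2e-2 * 255 ≈ 5.1) and 51/255 at the single worst value (2e-1 * 255 = 51).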
class InferenceResult(NamedTuple):
"""Result of an inference run."""
images: list[Image.Image]
elapsed_ms: float
def _cleanup_distributed():
"""Clean up distributed environment and GPU resources."""
if dist.is_initialized():
dist.destroy_process_group()
for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK"]:
os.environ.pop(key, None)
gc.collect()
if current_omni_platform.is_available():
current_omni_platform.empty_cache()
current_omni_platform.synchronize()
time.sleep(5)
def _diff_metrics(a: Image.Image, b: Image.Image) -> tuple[float, float]:
"""Return (mean_abs_diff, max_abs_diff) over RGB pixels in [0, 1]."""
ta = torch.from_numpy(np.asarray(a.convert("RGB"), dtype=np.float32) / 255.0)
tb = torch.from_numpy(np.asarray(b.convert("RGB"), dtype=np.float32) / 255.0)
assert ta.shape == tb.shape, f"Image shapes differ: {ta.shape} vs {tb.shape}"
abs_diff = torch.abs(ta - tb)
return abs_diff.mean().item(), abs_diff.max().item()
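# Illustrative (not executed by the test): two 8x8 grey images that differ by a
# single 8-bit step on one channel give roughly (0.0013, 0.0039), i.e. (1/255)/3
# averaged over the three channels and 1/255 at the maximum.
#   img_a = Image.new("RGB", (8, 8), (128, 128, 128))
#   img_b = Image.new("RGB", (8, 8), (129, 128, 128))
#   mean_d, max_d = _diff_metrics(img_a, img_b)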
def _run_inference(
model_name: str,
dtype: torch.dtype,
attn_backend: str,
ulysses_degree: int = 1,
ring_degree: int = 1,
height: int = DEFAULT_HEIGHT,
width: int = DEFAULT_WIDTH,
seed: int = DEFAULT_SEED,
warmup: bool = True,
) -> InferenceResult:
"""Run inference with specified configuration.
Args:
warmup: If True, run one warmup iteration before the timed run.
"""
parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree)
omni = Omni(
model=model_name,
parallel_config=parallel_config,
dtype=dtype,
attention_backend=attn_backend,
)
try:
# Warmup run (not timed)
if warmup:
_ = omni.generate(
PROMPT,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=DEFAULT_STEPS,
guidance_scale=0.0,
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000),
num_outputs_per_prompt=1,
),
)
# Timed run
start = time.time()
outputs = omni.generate(
PROMPT,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=DEFAULT_STEPS,
guidance_scale=0.0,
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
num_outputs_per_prompt=1,
),
)
elapsed_ms = (time.time() - start) * 1000
return InferenceResult(
images=outputs[0].request_output[0].images,
elapsed_ms=elapsed_ms,
)
finally:
omni.close()
_cleanup_distributed()
# =============================================================================
# Correctness & Performance Tests
# =============================================================================
# SP configurations: (ulysses_degree, ring_degree, height, width, warmup, is_perf_test)
# - warmup: whether to run warmup for this SP config
# - is_perf_test: whether this is a performance test (show speedup metrics)
SP_CONFIGS = [
(2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ulysses-2 - performance test
(1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ring-2 - performance test
(2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False), # Hybrid - correctness only
(4, 1, 272, 272, False, False), # Ulysses-4 - shape and correctness
]
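# Example reading of a config tuple: (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False)
# requests a hybrid Ulysses-2 x Ring-2 run, i.e. an SP world size of 2 * 2 = 4 GPUs,
# skips the warmup pass, and is checked for correctness only (no speedup metrics).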
def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str:
"""Get SP mode name for logging."""
if ulysses_degree > 1 and ring_degree == 1:
return f"ulysses-{ulysses_degree}"
elif ring_degree > 1 and ulysses_degree == 1:
return f"ring-{ring_degree}"
else:
return f"hybrid-{ulysses_degree}x{ring_degree}"
@pytest.mark.parametrize("model_name", MODELS)
def test_sp_correctness(model_name: str):
"""Test that SP inference produces correct outputs and measure performance.
Runs baseline once per unique (height, width), then tests all SP configs.
Note: Run with `pytest -v -s` to see detailed output.
"""
device_count = current_omni_platform.get_device_count()
# Cache baseline results by (height, width)
# Key: (height, width), Value: InferenceResult
baseline_cache: dict[tuple[int, int], InferenceResult] = {}
# Collect results for summary
results: list[dict] = []
print("\n" + "=" * 70)
print(f"Sequence Parallel Test - Model: {model_name}")
print(f"Available GPUs: {device_count}")
print("=" * 70)
for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS:
sp_size = ulysses_degree * ring_degree
sp_mode = _get_sp_mode(ulysses_degree, ring_degree)
if device_count < sp_size:
print(f"\n[{sp_mode}] SKIPPED (requires {sp_size} GPUs)")
continue
# Determine baseline warmup: only for default size (performance tests)
cache_key = (height, width)
baseline_warmup = height == DEFAULT_HEIGHT and width == DEFAULT_WIDTH
# Get or compute baseline for this (height, width)
if cache_key not in baseline_cache:
print(f"\n--- Running baseline {height}x{width} (warmup={baseline_warmup}) ---")
baseline = _run_inference(
model_name,
torch.bfloat16,
"sdpa",
height=height,
width=width,
warmup=baseline_warmup,
)
assert len(baseline.images) == 1
baseline_cache[cache_key] = baseline
print(f"[baseline] {height}x{width}: {baseline.elapsed_ms:.0f}ms")
else:
baseline = baseline_cache[cache_key]
# Run SP
print(f"\n--- Running {sp_mode} (warmup={sp_warmup}) ---")
sp_result = _run_inference(
model_name,
torch.bfloat16,
"sdpa",
ulysses_degree=ulysses_degree,
ring_degree=ring_degree,
height=height,
width=width,
warmup=sp_warmup,
)
assert len(sp_result.images) == 1
# Compare outputs (correctness)
mean_diff, max_diff = _diff_metrics(baseline.images[0], sp_result.images[0])
# Build result entry
result = {
"mode": sp_mode,
"sp_size": sp_size,
"height": height,
"width": width,
"baseline_ms": baseline.elapsed_ms,
"sp_ms": sp_result.elapsed_ms,
"mean_diff": mean_diff,
"max_diff": max_diff,
"is_perf_test": is_perf_test,
}
results.append(result)
# Output based on test type
if is_perf_test:
speedup = baseline.elapsed_ms / sp_result.elapsed_ms if sp_result.elapsed_ms > 0 else 0
result["speedup"] = speedup
print(
f"[{sp_mode}] {sp_size} GPUs | "
f"baseline: {baseline.elapsed_ms:.0f}ms, sp: {sp_result.elapsed_ms:.0f}ms, "
f"speedup: {speedup:.2f}x"
)
else:
print(f"[{sp_mode}] {sp_size} GPUs | sp: {sp_result.elapsed_ms:.0f}ms (correctness only)")
print(f"[{sp_mode}] diff: mean={mean_diff:.6e}, max={max_diff:.6e}")
# Assert correctness
assert mean_diff <= DIFF_MEAN_THRESHOLD and max_diff <= DIFF_MAX_THRESHOLD, (
f"[{sp_mode}] SP output differs from baseline: mean={mean_diff:.6e}, max={max_diff:.6e}"
)
# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"{'Mode':<15} {'GPUs':<6} {'Size':<10} {'Baseline':<12} {'SP':<12} {'Speedup':<10} {'Status'}")
print("-" * 70)
for r in results:
speedup_str = f"{r['speedup']:.2f}x" if r.get("speedup") else "N/A"
baseline_str = f"{r['baseline_ms']:.0f}ms" if r["is_perf_test"] else "N/A"
status = "PASS" if r["mean_diff"] <= DIFF_MEAN_THRESHOLD else "FAIL"
print(
f"{r['mode']:<15} {r['sp_size']:<6} {r['height']}x{r['width']:<5} "
f"{baseline_str:<12} {r['sp_ms']:.0f}ms{'':<7} {speedup_str:<10} {status}"
)
print("=" * 70)
import sys
from pathlib import Path
import numpy as np
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
# Use random weights model for CI testing (small, no authentication required)
models = ["linyueqian/stable_audio_random"]
@pytest.mark.parametrize("model_name", models)
def test_stable_audio_model(model_name: str):
m = Omni(model=model_name)
# Use minimal settings for testing
# Generate a short 2-second audio clip with minimal inference steps
audio_start_in_s = 0.0
audio_end_in_s = 2.0 # Short duration for fast testing
sample_rate = 44100 # Stable Audio uses 44100 Hz
outputs = m.generate(
prompts={
"prompt": "The sound of a dog barking",
"negative_prompt": "Low quality.",
},
sampling_params_list=OmniDiffusionSamplingParams(
num_inference_steps=4, # Minimal steps for speed
guidance_scale=7.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
extra_args={
"audio_start_in_s": audio_start_in_s,
"audio_end_in_s": audio_end_in_s,
},
),
)
# Extract audio from OmniRequestOutput
assert outputs is not None
first_output = outputs[0]
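# Note: the stage-level output is still typed "image" (apparently the generic
# diffusion-stage label); the per-request output checked below carries "audio".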
assert first_output.final_output_type == "image"
assert hasattr(first_output, "request_output") and first_output.request_output
req_out = first_output.request_output[0]
assert isinstance(req_out, OmniRequestOutput)
assert req_out.final_output_type == "audio"
assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
audio = req_out.multimodal_output.get("audio")
assert isinstance(audio, np.ndarray)
# audio shape: (batch, channels, samples)
# For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples
assert audio.ndim == 3
assert audio.shape[0] == 1 # batch size
assert audio.shape[1] == 2 # stereo channels
expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate)
assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds
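# Hedged follow-up sketch (not part of the test): persisting the generated clip.
# Assumes the optional `soundfile` package is available; audio has shape
# (batch, channels, samples), so the first clip is transposed to
# (samples, channels) before writing.
#   import soundfile as sf
#   sf.write("stable_audio_output.wav", audio[0].T, sample_rate)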
import os
import sys
from pathlib import Path
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"]
# Modelscope can't find riverclouds/qwen_image_random
# TODO: When NPU support is ready, remove this branch.
if current_omni_platform.is_npu():
models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"]
elif current_omni_platform.is_rocm():
# TODO: When ROCm support is ready, remove this branch.
# vLLM V0.11.0 has issues running riverclouds/qwen_image_random
# on ROCm
models = ["Tongyi-MAI/Z-Image-Turbo"]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str):
m = None
try:
m = Omni(model=model_name)
# high resolution may cause OOM on L4
height = 256
width = 256
outputs = m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=2,
),
)
# Extract images from the first request output's `images` attribute
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
images = req_out.images
assert len(images) == 2
# check image size
assert images[0].width == width
assert images[0].height == height
images[0].save("image_output.png")
except Exception as e:
print(f"Test failed with error: {e}")
raise
finally:
if m is not None and hasattr(m, "close"):
m.close()