Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pytest configuration and fixtures for vllm-omni tests.
"""
from typing import Any
import pytest
from vllm import TextPrompt
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.conftest import _run_post_test_cleanup, _run_pre_test_cleanup
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniSamplingParams
from vllm_omni.outputs import OmniRequestOutput
PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None
PromptImageInput = list[Any] | Any | None
PromptVideoInput = list[Any] | Any | None
class OmniRunner:
"""
Test runner for Omni models.
"""
def __init__(
self,
model_name: str,
seed: int = 42,
stage_init_timeout: int = 300,
batch_timeout: int = 10,
init_timeout: int = 300,
shm_threshold_bytes: int = 65536,
log_stats: bool = False,
stage_configs_path: str | None = None,
**kwargs,
) -> None:
"""
Initialize an OmniRunner for testing.
Args:
model_name: The model name or path
seed: Random seed for reproducibility
stage_init_timeout: Timeout for initializing a single stage in seconds
batch_timeout: Timeout for batching in seconds
init_timeout: Timeout for initializing stages in seconds
shm_threshold_bytes: Threshold for using shared memory
log_stats: Enable detailed statistics logging
stage_configs_path: Optional path to YAML stage config file
**kwargs: Additional arguments passed to Omni
"""
cleanup_dist_env_and_memory()
_run_pre_test_cleanup(enable_force=True)
_run_post_test_cleanup(enable_force=True)
self.model_name = model_name
self.seed = seed
self.omni = Omni(
model=model_name,
log_stats=log_stats,
stage_init_timeout=stage_init_timeout,
batch_timeout=batch_timeout,
init_timeout=init_timeout,
shm_threshold_bytes=shm_threshold_bytes,
stage_configs_path=stage_configs_path,
**kwargs,
)
def get_default_sampling_params_list(self) -> list[OmniSamplingParams]:
"""
Get a list of default sampling parameters for all stages.
Returns:
List of SamplingParams with default decoding for each stage
"""
return [st.default_sampling_params for st in self.omni.stage_list]
def get_omni_inputs(
self,
prompts: list[str] | str,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
images: PromptImageInput = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
modalities: list[str] | None = None,
) -> list[TextPrompt]:
"""
Construct Omni input format from prompts and multimodal data.
Args:
prompts: Text prompt(s) - either a single string or list of strings
system_prompt: Optional system prompt (defaults to Qwen system prompt)
audios: Audio input(s) - tuple of (audio_array, sample_rate) or list of tuples
images: Image input(s) - PIL Image or list of PIL Images
videos: Video input(s) - numpy array or list of numpy arrays
mm_processor_kwargs: Optional processor kwargs (e.g., use_audio_in_video)
modalities: Optional list of output modalities to request
Returns:
List of prompt dictionaries suitable for Omni.generate()
"""
if system_prompt is None:
system_prompt = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
video_padding_token = "<|VIDEO|>"
image_padding_token = "<|IMAGE|>"
audio_padding_token = "<|AUDIO|>"
if self.model_name == "Qwen/Qwen3-Omni-30B-A3B-Instruct":
video_padding_token = "<|video_pad|>"
image_padding_token = "<|image_pad|>"
audio_padding_token = "<|audio_pad|>"
if isinstance(prompts, str):
prompts = [prompts]
def _normalize_mm_input(mm_input, num_prompts):
if mm_input is None:
return [None] * num_prompts
if isinstance(mm_input, list):
if len(mm_input) != num_prompts:
raise ValueError(
f"Multimodal input list length ({len(mm_input)}) must match prompts length ({num_prompts})"
)
return mm_input
return [mm_input] * num_prompts
num_prompts = len(prompts)
audios_list = _normalize_mm_input(audios, num_prompts)
images_list = _normalize_mm_input(images, num_prompts)
videos_list = _normalize_mm_input(videos, num_prompts)
omni_inputs = []
for i, prompt_text in enumerate(prompts):
user_content = ""
multi_modal_data = {}
audio = audios_list[i]
if audio is not None:
if isinstance(audio, list):
for _ in audio:
user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
multi_modal_data["audio"] = audio
else:
user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
multi_modal_data["audio"] = audio
image = images_list[i]
if image is not None:
if isinstance(image, list):
for _ in image:
user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
multi_modal_data["image"] = image
else:
user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
multi_modal_data["image"] = image
video = videos_list[i]
if video is not None:
if isinstance(video, list):
for _ in video:
user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
multi_modal_data["video"] = video
else:
user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
multi_modal_data["video"] = video
user_content += prompt_text
full_prompt = (
f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
f"<|im_start|>user\n{user_content}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
input_dict: TextPrompt = {"prompt": full_prompt}
if multi_modal_data:
input_dict["multi_modal_data"] = multi_modal_data
if modalities:
input_dict["modalities"] = modalities
if mm_processor_kwargs:
input_dict["mm_processor_kwargs"] = mm_processor_kwargs
omni_inputs.append(input_dict)
return omni_inputs
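# Illustrative example (assumes the default, non-Qwen3 padding tokens): for a single
# audio input and the prompt "What is said?", get_omni_inputs builds a chat-formatted
# string of the form
#   <|im_start|>system\n{system_prompt}<|im_end|>\n
#   <|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>What is said?<|im_end|>\n
#   <|im_start|>assistant\n
# and attaches the raw audio tuple under multi_modal_data["audio"].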
def generate(
self,
prompts: list[TextPrompt],
sampling_params_list: list[OmniSamplingParams] | None = None,
) -> list[OmniRequestOutput]:
"""
Generate outputs for the given prompts.
Args:
prompts: List of prompt dictionaries with 'prompt' and optionally
'multi_modal_data' keys
sampling_params_list: List of sampling parameters for each stage.
If None, uses default parameters.
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
if sampling_params_list is None:
sampling_params_list = self.get_default_sampling_params_list()
return self.omni.generate(prompts, sampling_params_list)
def generate_multimodal(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
images: PromptImageInput = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
modalities: list[str] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with multimodal inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
audios: Audio input(s)
images: Image input(s)
videos: Video input(s)
mm_processor_kwargs: Optional processor kwargs
modalities: Optional list of output modalities to request
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
audios=audios,
images=images,
videos=videos,
mm_processor_kwargs=mm_processor_kwargs,
modalities=modalities,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_audio(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
audios: PromptAudioInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with audio inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
audios: Audio input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
audios=audios,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_video(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
videos: PromptVideoInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with video inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
videos: Video input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
videos=videos,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def generate_image(
self,
prompts: list[str] | str,
sampling_params_list: list[OmniSamplingParams] | None = None,
system_prompt: str | None = None,
images: PromptImageInput = None,
mm_processor_kwargs: dict[str, Any] | None = None,
) -> list[OmniRequestOutput]:
"""
Convenience method to generate with image inputs.
Args:
prompts: Text prompt(s)
sampling_params_list: List of sampling parameters for each stage
system_prompt: Optional system prompt
images: Image input(s)
mm_processor_kwargs: Optional processor kwargs
Returns:
List of OmniRequestOutput objects from stages with final_output=True
"""
omni_inputs = self.get_omni_inputs(
prompts=prompts,
system_prompt=system_prompt,
images=images,
mm_processor_kwargs=mm_processor_kwargs,
)
return self.generate(omni_inputs, sampling_params_list)
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - cleanup resources."""
self.close()
del self.omni
cleanup_dist_env_and_memory()
_run_post_test_cleanup(enable_force=True)
def close(self):
"""Close and cleanup the Omni instance."""
if hasattr(self.omni, "close"):
self.omni.close()
@pytest.fixture(scope="session")
def omni_runner():
return OmniRunner
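# Minimal usage sketch (hypothetical test, not part of this file): the session-scoped
# fixture yields the OmniRunner class itself, so tests instantiate it as a context
# manager; the model name, stage config path, and image variable below are placeholders.
#
#     def test_example(omni_runner):
#         with omni_runner("Qwen/Qwen2.5-Omni-3B", seed=42,
#                          stage_configs_path="stage_configs/qwen2_5_omni_ci.yaml") as runner:
#             outputs = runner.generate_multimodal(prompts="Describe the image.",
#                                                  images=some_pil_image)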
# stage config for running BAGEL with Mooncake connector for CI e2e tests.
# This config is optimized for single GPU tests with Mooncake inter-stage communication.
stage_args:
- stage_id: 0
stage_type: llm
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: mp
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_send_cache: true
kv_transfer_criteria:
type: prefill_finished
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 52
detokenize: true
repetition_penalty: 1.05
output_connectors:
to_stage_1: mooncake_connector
- stage_id: 1
stage_type: diffusion
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: dit
gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image
distributed_executor_backend: mp
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_recv_cache: true
engine_input_source: [0]
final_output: true
final_output_type: image
is_comprehension: false
default_sampling_params:
seed: 52
input_connectors:
from_stage_0: mooncake_connector
# Top-level runtime config with Mooncake connector
runtime:
enabled: true
defaults:
window_size: -1
max_inflight: 1
connectors:
mooncake_connector:
name: MooncakeConnector
extra:
host: "${MOONCAKE_HOST}"
metadata_server: "http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata"
master: "${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}"
segment: 64000000
localbuf: 64000000
proto: tcp
edges:
- from: 0
to: 1
window_size: -1
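# Note: the ${MOONCAKE_HOST}, ${MOONCAKE_RPC_PORT}, and ${MOONCAKE_HTTP_PORT} placeholders
# are not resolved by the YAML loader; the Bagel Mooncake e2e test substitutes them with
# the host and ports of the locally started mooncake_master before passing the file to Omni.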
# stage config for running BAGEL with SharedMemory connector for CI e2e tests.
# This config is optimized for single GPU tests with SharedMemory inter-stage communication.
stage_args:
- stage_id: 0
stage_type: llm
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_send_cache: true
kv_transfer_criteria:
type: prefill_finished  # or when a special token is generated
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 52
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
stage_type: diffusion
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: dit
gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
tensor_parallel_size: 1
omni_kv_config:
need_recv_cache: true
engine_input_source: [0]
final_output: true
final_output_type: image
is_comprehension: false
default_sampling_params:
seed: 52
# Runtime edges
runtime:
enabled: true
defaults:
window_size: -1
max_inflight: 1
# Distributed connectors configuration (optional)
# More connectors will be supported in the future.
connectors:
shared_memory_connector:
name: SharedMemoryConnector
extra:
shm_threshold_bytes: 65536 # 64KB threshold
edges:
- from: 0
to: 1
window_size: -1
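# Note: this file is consumed by the Bagel shared-memory e2e test via
# Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=<path to this file>).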
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
max_num_batched_tokens: 4069
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
enforce_eager: false
trust_remote_code: true
engine_output_type: latent # Output hidden states for talker
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 1
load_format: dummy
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 100
seed: 42
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.6
enforce_eager: true
trust_remote_code: true
engine_output_type: latent # Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching: false
distributed_executor_backend: "mp"
hf_config_name: talker_config
load_format: dummy
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 100
seed: 42
detokenize: False
repetition_penalty: 1.05
stop_token_ids: [2150]
- stage_id: 2
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio # Final output: audio waveform
gpu_memory_utilization: 0.1
distributed_executor_backend: "mp"
max_num_batched_tokens: 1000000
hf_config_name: thinker_config
load_format: dummy
async_scheduling: false
engine_input_source: [1]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 200
seed: 42
detokenize: True
repetition_penalty: 1.1
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len: 896
max_num_batched_tokens: 896
max_num_seqs: 1
gpu_memory_utilization: 0.8
skip_mm_profiling: true
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 128
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
enforce_eager: false
trust_remote_code: true
engine_output_type: latent # Output hidden states for talker
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 1
load_format: dummy
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 100
seed: 42
detokenize: True
repetition_penalty: 1.05
- stage_id: 1
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.6
enforce_eager: true
trust_remote_code: true
engine_output_type: latent # Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching: false
distributed_executor_backend: "mp"
hf_config_name: talker_config
load_format: dummy
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 100
seed: 42
detokenize: False
repetition_penalty: 1.05
stop_token_ids: [2150]
- stage_id: 2
runtime:
devices: "1"
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio # Final output: audio waveform
gpu_memory_utilization: 0.1
distributed_executor_backend: "mp"
max_num_batched_tokens: 1000000
hf_config_name: thinker_config
load_format: dummy
async_scheduling: false
engine_input_source: [1]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 200
seed: 42
detokenize: True
repetition_penalty: 1.1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end test for Bagel text2img generation.
This test validates that the Bagel model generates images that match
expected reference pixel values within a ±5 tolerance.
Equivalent to running:
python3 examples/offline_inference/bagel/end2end.py \
--prompts "A futuristic city skyline at twilight, cyberpunk style" \
--modality text2img --step 15
"""
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
import signal
import socket
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any
import pytest
from PIL import Image
from tests.utils import hardware_test
from vllm_omni.entrypoints.omni import Omni
# Reference pixel data extracted from the known-good output image
# Each entry contains (x, y) position and expected (R, G, B) values
# "Generated with seed=52, num_inference_steps=15,
# prompt='A futuristic city skyline at twilight, cyberpunk style'"
REFERENCE_PIXELS = [
{"position": (100, 100), "rgb": (68, 107, 134)},
{"position": (400, 50), "rgb": (95, 139, 166)},
{"position": (700, 100), "rgb": (99, 122, 151)},
{"position": (150, 400), "rgb": (111, 125, 153)},
{"position": (512, 512), "rgb": (97, 107, 131)},
{"position": (700, 400), "rgb": (48, 64, 98)},
{"position": (100, 700), "rgb": (79, 63, 84)},
{"position": (400, 700), "rgb": (40, 58, 79)},
{"position": (700, 700), "rgb": (60, 75, 103)},
{"position": (256, 256), "rgb": (97, 128, 156)},
]
# Maximum allowed difference per color channel
PIXEL_TOLERANCE = 5
# Default test prompt
DEFAULT_PROMPT = "<|im_start|>A futuristic city skyline at twilight, cyberpunk style<|im_end|>"
def _find_free_port() -> int:
"""Find and return a free ephemeral port by binding to port 0."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
s.listen(1)
port = s.getsockname()[1]
return port
def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list:
"""Configure sampling parameters for Bagel text2img generation.
Args:
omni: The Omni instance to get default params from.
max_tokens: Maximum tokens for the first stage.
num_inference_steps: Number of inference steps for the diffusion stage.
Returns:
Configured sampling params list.
"""
params_list = omni.default_sampling_params_list
params_list[0].max_tokens = max_tokens # type: ignore
if len(params_list) > 1:
params_list[1].num_inference_steps = num_inference_steps # type: ignore
return params_list
def _extract_generated_image(omni_outputs: list) -> Image.Image | None:
"""Extract the generated image from Omni outputs.
Args:
omni_outputs: List of outputs from omni.generate().
Returns:
The first generated PIL Image, or None if no image found.
"""
for req_output in omni_outputs:
if images := getattr(req_output, "images", None):
return images[0]
if hasattr(req_output, "request_output") and req_output.request_output:
for stage_out in req_output.request_output:
if hasattr(stage_out, "images") and stage_out.images:
return stage_out.images[0]
return None
def _validate_pixels(
image: Image.Image,
reference_pixels: list[dict[str, Any]] = REFERENCE_PIXELS,
tolerance: int = PIXEL_TOLERANCE,
) -> None:
"""Validate that image pixels match expected reference values.
Args:
image: The PIL Image to validate.
reference_pixels: List of dicts with 'position' (x, y) and 'rgb' (R, G, B).
tolerance: Maximum allowed difference per color channel.
Raises:
AssertionError: If any pixel differs beyond tolerance.
"""
for ref in reference_pixels:
x, y = ref["position"]
expected = ref["rgb"]
actual = image.getpixel((x, y))[:3]
assert all(abs(a - e) <= tolerance for a, e in zip(actual, expected)), (
f"Pixel mismatch at ({x}, {y}): expected {expected}, got {actual}"
)
def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Image:
"""Generate an image using Bagel model with configured parameters.
Args:
omni: The Omni instance to use for generation.
prompt: The text prompt for image generation.
Returns:
The generated PIL Image.
Raises:
AssertionError: If no image is generated or size is incorrect.
"""
params_list = _configure_sampling_params(omni)
omni_outputs = list(
omni.generate(
prompts=[{"prompt": prompt, "modalities": ["image"]}],
sampling_params_list=params_list,
)
)
generated_image = _extract_generated_image(omni_outputs)
assert generated_image is not None, "No images generated"
assert generated_image.size == (1024, 1024), f"Expected 1024x1024, got {generated_image.size}"
return generated_image
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_shared_memory_connector():
"""Test Bagel text2img with shared memory connector."""
config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
try:
generated_image = _generate_bagel_image(omni)
_validate_pixels(generated_image)
finally:
omni.close()
def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool:
"""Wait for a port to become available.
Args:
host: The host address.
port: The port number.
timeout: Maximum seconds to wait.
Returns:
True if port becomes available, False otherwise.
"""
for _ in range(timeout):
try:
with socket.create_connection((host, port), timeout=1):
return True
except (TimeoutError, ConnectionRefusedError):
time.sleep(1)
return False
def _cleanup_mooncake_processes(timeout_secs: int = 5) -> None:
"""Clean up any existing mooncake_master processes.
Args:
timeout_secs: Maximum seconds to wait for graceful termination.
"""
subprocess.run(
["pkill", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
start_time = time.time()
while time.time() - start_time < timeout_secs:
result = subprocess.run(
["pgrep", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
if result.returncode != 0:
break
time.sleep(0.5)
else:
subprocess.run(
["pkill", "-9", "-f", "mooncake_master"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
time.sleep(1)
def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str:
"""Load Mooncake config from YAML and substitute placeholders.
Args:
host: Mooncake host address.
rpc_port: RPC port for Mooncake master.
http_port: HTTP metadata server port.
Returns:
Path to the temporary config file with substituted values.
"""
config_path = str(Path(__file__).parent / "stage_configs" / "bagel_mooncake_ci.yaml")
with open(config_path) as f:
config_content = f.read()
# Substitute placeholders
config_content = config_content.replace("${MOONCAKE_HOST}", host)
config_content = config_content.replace("${MOONCAKE_RPC_PORT}", str(rpc_port))
config_content = config_content.replace("${MOONCAKE_HTTP_PORT}", str(http_port))
# Write to temp file
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
temp_file.write(config_content)
temp_file.close()
return temp_file.name
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_mooncake_connector():
"""Test Bagel text2img with Mooncake connector for inter-stage communication."""
MOONCAKE_HOST = "127.0.0.1"
MOONCAKE_RPC_PORT = _find_free_port()
MOONCAKE_HTTP_PORT = _find_free_port()
MOONCAKE_METRICS_PORT = _find_free_port()
mooncake_master_proc = None
temp_config_file = None
omni = None
try:
_cleanup_mooncake_processes()
# Start mooncake_master
mooncake_master_proc = subprocess.Popen(
[
"mooncake_master",
f"--rpc_port={MOONCAKE_RPC_PORT}",
"--enable_http_metadata_server=true",
"--http_metadata_server_host=0.0.0.0",
f"--http_metadata_server_port={MOONCAKE_HTTP_PORT}",
f"--metrics_port={MOONCAKE_METRICS_PORT}",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
preexec_fn=os.setsid,
)
assert _wait_for_port(MOONCAKE_HOST, MOONCAKE_RPC_PORT), "mooncake_master failed to start"
# Create temp config and initialize Omni
temp_config_file = _load_mooncake_config(
host=MOONCAKE_HOST,
rpc_port=MOONCAKE_RPC_PORT,
http_port=MOONCAKE_HTTP_PORT,
)
omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300)
generated_image = _generate_bagel_image(omni)
_validate_pixels(generated_image)
finally:
if omni:
omni.close()
if temp_config_file:
try:
os.unlink(temp_config_file)
except OSError:
pass
if mooncake_master_proc:
try:
os.killpg(os.getpgid(mooncake_master_proc.pid), signal.SIGKILL)
except OSError:
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for cache-dit backend.
This test verifies that cache-dit acceleration works correctly with diffusion models.
It uses minimal settings to keep test time short for CI.
"""
import os
import sys
from pathlib import Path
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
from vllm_omni.outputs import OmniRequestOutput
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
# Use random weights model for testing
models = ["riverclouds/qwen_image_random"]
@pytest.mark.parametrize("model_name", models)
def test_cache_dit(model_name: str):
"""Test cache-dit backend with diffusion model."""
# Configure cache-dit with minimal settings for fast testing
cache_config = {
"Fn_compute_blocks": 1,
"Bn_compute_blocks": 0,
"max_warmup_steps": 2, # Minimal warmup for fast test
"residual_diff_threshold": 0.24,
"max_continuous_cached_steps": 3,
}
m = None
try:
m = Omni(
model=model_name,
cache_backend="cache_dit",
cache_config=cache_config,
)
# Use minimal settings for fast testing
height = 256
width = 256
num_inference_steps = 4 # Minimal steps for fast test
outputs = m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=num_inference_steps,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1, # Single output for speed
),
)
# Extract images from request_output[0]['images']
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
images = req_out.images
# Verify generation succeeded
assert images is not None
assert len(images) == 1
# Check image size
assert images[0].width == width
assert images[0].height == height
except Exception as e:
print(f"Test failed with error: {e}")
raise
finally:
if m is not None and hasattr(m, "close"):
m.close()
import sys
from pathlib import Path
import pytest
import torch
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.utils import GPUMemoryMonitor
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
models = ["riverclouds/qwen_image_random"]
def inference(model_name: str, offload: bool = True):
current_omni_platform.empty_cache()
device_index = torch.cuda.current_device()
monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
monitor.start()
m = Omni(model=model_name, enable_cpu_offload=offload)
torch.cuda.reset_peak_memory_stats(device=device_index)
height = 256
width = 256
m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=9,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
),
)
peak = monitor.peak_used_mb
monitor.stop()
return peak
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", models)
def test_cpu_offload_diffusion_model(model_name: str):
try:
no_offload_peak_memory = inference(model_name, offload=False)
cleanup_dist_env_and_memory()
offload_peak_memory = inference(model_name, offload=True)
except Exception as e:
pytest.fail(f"Inference failed: {e}")
print(f"Offload peak memory: {offload_peak_memory} MB")
print(f"No offload peak memory: {no_offload_peak_memory} MB")
assert offload_peak_memory + 2500 < no_offload_peak_memory, (
f"Offload peak memory {offload_peak_memory} MB should be less than no offload peak memory {no_offload_peak_memory} MB"
)
import sys
from pathlib import Path
import pytest
import torch
from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
from tests.utils import GPUMemoryMonitor
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
# Models to test, mapped to the expected GPU memory savings in MB
MODELS_SAVED_MEMORY_MB = {"riverclouds/qwen_image_random": 4500}
def run_inference(
model_name: str,
layerwise_offload: bool = False,
num_gpu_layers: int = 1,
num_inference_steps: int = 3,
) -> float:
# For now, only CUDA GPUs are supported, so torch.cuda operations are applied here.
# NPU / ROCm platforms are detected by the skipif marker and this test is skipped there.
torch.cuda.empty_cache()
device_index = torch.cuda.current_device()
monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
monitor.start()
m = Omni(
model=model_name,
enable_layerwise_offload=layerwise_offload,
layerwise_num_gpu_layers=num_gpu_layers,
boundary_ratio=0.875,
flow_shift=5.0,
)
torch.cuda.reset_peak_memory_stats(device=device_index)
# Refer to tests/e2e/offline_inference/test_t2v_model.py
# Use minimal settings for testing
height = 480
width = 640
num_frames = 5
m.generate(
"A cat sitting on a table",
OmniDiffusionSamplingParams(
height=height,
width=width,
generator=torch.Generator("cuda").manual_seed(42),
guidance_scale=1.0,
num_inference_steps=num_inference_steps,
num_frames=num_frames,
),
)
peak = monitor.peak_used_mb
monitor.stop()
return peak
@pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
@pytest.mark.parametrize("model_name", MODELS_SAVED_MEMORY_MB.keys())
def test_layerwise_offload_diffusion_model(model_name: str):
"""Test that layerwise offloading reduces GPU memory usage.
This test verifies that layerwise offloading significantly reduces peak
GPU memory usage compared to loading the entire model on GPU. The layerwise
offloader keeps only a single transformer block on GPU at a time, with
prefetching for compute-memory overlap.
"""
try:
# Run without layerwise offloading (baseline)
no_offload_peak_memory = run_inference(model_name, layerwise_offload=False)
cleanup_dist_env_and_memory()
# Run with layerwise offloading (1 layer on device)
layerwise_offload_peak_memory = run_inference(model_name, layerwise_offload=True, num_gpu_layers=1)
cleanup_dist_env_and_memory()
# Run with 2 layers on device
layerwise_offload_two_layers_peak = run_inference(model_name, layerwise_offload=True, num_gpu_layers=2)
except Exception as e:
pytest.fail(f"Inference failed: {e}")
print(f"Layerwise offload peak memory (1 GPU layer): {layerwise_offload_peak_memory} MB")
print(f"Layerwise offload peak memory (2 GPU layers): {layerwise_offload_two_layers_peak} MB")
print(f"No offload peak memory: {no_offload_peak_memory} MB")
# Verify that layerwise offloading significantly reduces memory usage
# Passes only if the actual savings exceeds the expected savings
assert layerwise_offload_peak_memory + MODELS_SAVED_MEMORY_MB[model_name] < no_offload_peak_memory, (
f"Layerwise offload peak memory {layerwise_offload_peak_memory} MB "
f"should be significantly less than no offload peak memory {no_offload_peak_memory} MB"
)
# Verify that 2 GPU layers use more memory than 1 GPU layer
# But not excessively more (should be a reasonable increase)
assert layerwise_offload_peak_memory < layerwise_offload_two_layers_peak, (
f"1 GPU layer peak {layerwise_offload_peak_memory} MB should be < "
f"2 GPU layers peak {layerwise_offload_two_layers_peak} MB"
)
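# Conceptual sketch (illustration only, not the vllm_omni implementation; names are
# hypothetical): with layerwise offloading, block weights stay on CPU and each block is
# copied to the GPU only while it runs, so peak usage is roughly one block plus activations.
#
#     for block in transformer_blocks:            # all blocks resident on CPU
#         block.to("cuda")                        # stage this block's weights onto the GPU
#         hidden_states = block(hidden_states)    # compute (prefetching of the next block omitted)
#         block.to("cpu")                         # evict so only one block stays resident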
import json
import os
import sys
from pathlib import Path
import pytest
import torch
from safetensors.torch import save_file
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
# This test is specific to Z-Image LoRA behavior. Keep it focused on a single
# model to reduce runtime and avoid extra downloads.
models = ["Tongyi-MAI/Z-Image-Turbo"]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str, tmp_path: Path):
def _extract_images(outputs: list[OmniRequestOutput]):
if not outputs:
raise ValueError("Empty outputs from Omni.generate()")
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
return req_out.images
def _write_zimage_lora(adapter_dir: Path) -> str:
adapter_dir.mkdir(parents=True, exist_ok=True)
# Z-Image transformer uses dim=3840 by default (see ZImageTransformer2DModel).
dim = 3840
module_name = "transformer.layers.0.attention.to_qkv"
rank = 1
lora_a = torch.zeros((rank, dim), dtype=torch.float32)
lora_a[0, 0] = 1.0
# QKVParallelLinear packs (Q, K, V). With tp=1 and n_kv_heads==n_heads in Z-Image,
# each slice is `dim`, so total out dim is `3 * dim`.
lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32)
# Apply a visible delta to the Q slice only to keep the perturbation bounded.
lora_b[:dim, 0] = 0.1
save_file(
{
f"base_model.model.{module_name}.lora_A.weight": lora_a,
f"base_model.model.{module_name}.lora_B.weight": lora_b,
},
str(adapter_dir / "adapter_model.safetensors"),
)
(adapter_dir / "adapter_config.json").write_text(
json.dumps(
{
"r": rank,
"lora_alpha": rank,
"target_modules": [module_name],
}
),
encoding="utf-8",
)
return str(adapter_dir)
m = Omni(model=model_name)
try:
# high resolution may cause OOM on L4
height = 256
width = 256
prompt = "a photo of a cat sitting on a laptop keyboard"
outputs = m.generate(
prompt,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
),
)
images = _extract_images(outputs)
assert len(images) == 1
# check image size
assert images[0].width == width
assert images[0].height == height
# Real LoRA E2E: generate again with a real on-disk PEFT adapter and
# verify that output changes.
if model_name == "Tongyi-MAI/Z-Image-Turbo":
from vllm_omni.lora.request import LoRARequest
from vllm_omni.lora.utils import stable_lora_int_id
lora_dir = _write_zimage_lora(tmp_path / "zimage_lora")
lora_request = LoRARequest(
lora_name="test",
lora_int_id=stable_lora_int_id(lora_dir),
lora_path=lora_dir,
)
outputs_lora = m.generate(
prompt,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
lora_request=lora_request,
lora_scale=2.0,
),
)
images_lora = _extract_images(outputs_lora)
assert len(images_lora) == 1
assert images_lora[0].width == width
assert images_lora[0].height == height
import numpy as np
diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean()
assert diff > 0.0
finally:
m.close()
"""
Tests for Ovis Image model pipeline.
Strategy:
1. `mock_dependencies` fixture mocks heavy external components (VAE, Scheduler, TextEncoder)
to allow fast testing of the pipeline logic without downloading weights.
- Mocks are configured to return tensors on the correct device.
- Transformer is mocked dynamically to return random noise of correct shape.
2. `test_real_transformer_init_and_forward` tests the actual `OvisImageTransformer2DModel`
initialization and forward pass with a small configuration to ensure code coverage
and correctness of the model definition itself, independent of the pipeline mocks.
"""
from unittest.mock import MagicMock, patch
import pytest
import torch
from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig
# The real OvisImageTransformer2DModel is not lightweight, so these pipeline tests mock
# the transformer forward to return random noise instead of instantiating it.
from vllm_omni.diffusion.distributed.utils import get_local_device
from vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image import OvisImagePipeline
from vllm_omni.diffusion.request import OmniDiffusionRequest
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
@pytest.fixture
def mock_dependencies(monkeypatch):
"""
Mock external dependencies to avoid loading real models.
"""
device = get_local_device()
# Mock Tokenizer
mock_tokenizer = MagicMock()
mock_tokenizer.return_value = MagicMock(
input_ids=torch.zeros((1, 50), dtype=torch.long, device=device),
attention_mask=torch.ones((1, 50), dtype=torch.long, device=device),
)
mock_tokenizer.apply_chat_template.return_value = "dummy prompt"
mock_tokenizer.model_max_length = 1024
# Mock Text Encoder
mock_text_encoder = MagicMock()
mock_text_encoder.dtype = torch.float32
# Output of text encoder must be on the same device as inputs (which are moved to execution_device)
mock_text_encoder.return_value.last_hidden_state = torch.randn(1, 50, 32, device=device)
# Mock VAE
mock_vae = MagicMock()
mock_vae.config.block_out_channels = [128, 256, 512, 512] # Scale factor 8
mock_vae.config.scale_factor_temporal = 1
mock_vae.config.scale_factor_spatial = 8
mock_vae.config.scaling_factor = 0.18215
mock_vae.config.shift_factor = 0.0
# Decode return value
mock_vae.decode.return_value = [torch.randn(1, 3, 128, 128, device=device)]
# Ensure .to() returns self so configuration persists
mock_vae.to.return_value = mock_vae
# Mock Scheduler
mock_scheduler = MagicMock()
mock_scheduler.config = MagicMock()
# Timesteps on device to match latents during denoising loop interaction if needed
mock_scheduler.timesteps = torch.tensor([1.0, 0.5, 0.0], device=device)
mock_scheduler.set_timesteps.return_value = None
# Make step return dynamic based on input sample shape
def mock_scheduler_step(model_output, timestep, sample, **kwargs):
# sample is the latents, should be preserved
return (torch.randn_like(sample),)
mock_scheduler.step.side_effect = mock_scheduler_step
module_path = "vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image"
monkeypatch.setattr(f"{module_path}.Qwen2TokenizerFast.from_pretrained", lambda *a, **k: mock_tokenizer)
monkeypatch.setattr(f"{module_path}.Qwen3Model.from_pretrained", lambda *a, **k: mock_text_encoder)
monkeypatch.setattr(f"{module_path}.AutoencoderKL.from_pretrained", lambda *a, **k: mock_vae)
monkeypatch.setattr(
f"{module_path}.FlowMatchEulerDiscreteScheduler.from_pretrained", lambda *a, **k: mock_scheduler
)
return {
"tokenizer": mock_tokenizer,
"text_encoder": mock_text_encoder,
"vae": mock_vae,
"scheduler": mock_scheduler,
"device": device,
}
@pytest.fixture
def ovis_pipeline(mock_dependencies, monkeypatch):
"""
Creates an OvisImagePipeline instance with mocked components.
"""
# Create config
tf_config = TransformerConfig(
params={
"in_channels": 4,
"out_channels": 4,
"sample_size": 32,
"patch_size": 2,
"num_attention_heads": 4,
"attention_head_dim": 8,
"num_layers": 1,
"caption_channels": 32,
}
)
od_config = OmniDiffusionConfig(
model="dummy-ovis",
tf_model_config=tf_config,
dtype=torch.float32,
num_gpus=1,
)
# Mock Transformer Layer separately to avoid full init
# We patch OvisImageTransformer2DModel class in the module
mock_transformer_cls = MagicMock()
mock_transformer_instance = MagicMock()
mock_transformer_instance.dtype = torch.float32
mock_transformer_instance.in_channels = 16 # Must be 16 so num_channel_latents=4, packed=16
# Forward return: noise prediction
def mock_forward(hidden_states, *args, **kwargs):
# hidden_states shape: (B, SeqLen, Channels)
return (torch.randn_like(hidden_states),)
mock_transformer_instance.forward.side_effect = mock_forward
# Also make the instance itself callable to mimic __call__
mock_transformer_instance.side_effect = mock_forward
mock_transformer_cls.return_value = mock_transformer_instance
monkeypatch.setattr(
"vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image.OvisImageTransformer2DModel", mock_transformer_cls
)
# Initialize pipeline
# We use a dummy model path check override
with patch("os.path.exists", return_value=True):
pipeline = OvisImagePipeline(od_config=od_config)
return pipeline
def test_interface_compliance(ovis_pipeline):
"""Verify methods required by vllm-omni framework."""
assert hasattr(ovis_pipeline, "load_weights")
assert hasattr(ovis_pipeline, "scheduler")
assert hasattr(ovis_pipeline, "transformer")
assert hasattr(ovis_pipeline, "text_encoder")
# assert hasattr(ovis_pipeline, "vae") # Ovis uses VAE
def test_basic_generation(ovis_pipeline):
"""Test the forward pass logic."""
# Setup request
req = OmniDiffusionRequest(
prompts=["A photo of a cat"],
sampling_params=OmniDiffusionSamplingParams(
height=256,
width=256,
num_inference_steps=2,
guidance_scale=1.0,
),
)
output = ovis_pipeline(req)
assert output is not None
assert output.output is not None
# Output should be a tensor from mocked VAE decode [torch.randn(1, 3, 128, 128)]
assert isinstance(output.output, torch.Tensor)
assert output.output.shape == (1, 3, 128, 128)
# Check that transformer was called
assert ovis_pipeline.transformer.call_count > 0
def test_guidance_scale(ovis_pipeline):
"""Test that classifier-free guidance path is taken when scale > 1.0."""
req = OmniDiffusionRequest(
prompts=[
{
"prompt": "A photo of a cat",
"negative_prompt": "bad quality",
}
],
sampling_params=OmniDiffusionSamplingParams(
height=256,
width=256,
num_inference_steps=1,
guidance_scale=2.0, # Trigger CFG
),
)
ovis_pipeline(req)
assert ovis_pipeline.transformer.call_count >= 2
def test_resolution_check(ovis_pipeline):
"""Test resolution divisible validation logic if present."""
# Pass odd resolution
req = OmniDiffusionRequest(
prompts=["test"],
sampling_params=OmniDiffusionSamplingParams(
height=250, # Not divisible by 16 (8*2)
width=250,
),
)
# The pipeline logs a warning for non-divisible resolutions but proceeds with generation.
output = ovis_pipeline(req)
assert output is not None
def test_real_transformer_init_and_forward():
"""Test the real OvisImageTransformer2DModel initialization and forward pass for coverage."""
from unittest.mock import patch
from vllm_omni.diffusion.models.ovis_image.ovis_image_transformer import OvisImageTransformer2DModel
device = get_local_device()
tf_config = TransformerConfig(
params={
"patch_size": 2,
"in_channels": 16,
"out_channels": 16,
"num_layers": 1,
"num_single_layers": 1,
"attention_head_dim": 8,
"num_attention_heads": 2,
"joint_attention_dim": 32,
"axes_dims_rope": (4, 4, 4),
}
)
od_config = OmniDiffusionConfig(model="dummy-ovis", tf_model_config=tf_config, dtype=torch.bfloat16, num_gpus=1)
torch.set_default_dtype(torch.bfloat16)
# Mock distributed state for QKVParallelLinear initialization
# We patch get_tp_group because get_tensor_model_parallel_rank calls it and asserts _TP is not None
mock_group = MagicMock()
mock_group.rank_in_group = 0
mock_group.world_size = 1
with patch("vllm.distributed.parallel_state.get_tp_group", return_value=mock_group):
# Initialize real model
model = OvisImageTransformer2DModel(
od_config=od_config,
patch_size=1,
in_channels=16,
out_channels=16,
num_single_layers=1,
attention_head_dim=8,
num_attention_heads=2,
joint_attention_dim=32,
axes_dims_rope=(2, 2, 4),
).to(device)
# Create dummy inputs
B, Seq, C = 1, 16, 16
hidden_states = torch.randn(B, Seq, C, device=device)
encoder_hidden_states = torch.randn(B, 10, 32, device=device) # joint_attention_dim=32
timestep = torch.tensor([1], device=device)
img_ids = torch.zeros(Seq, 3, device=device)
txt_ids = torch.zeros(10, 3, device=device)
# Run forward
output = model(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
timestep=timestep,
img_ids=img_ids,
txt_ids=txt_ids,
return_dict=False,
)
assert output is not None
assert isinstance(output, tuple)
assert output[0].shape == hidden_states.shape
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E tests for Qwen2.5-Omni model with mixed modality inputs and audio output.
"""
from pathlib import Path
import pytest
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.multimodal.image import convert_image_mode
from vllm_omni.platforms import current_omni_platform
from .conftest import OmniRunner
from .utils import create_new_process_for_each_test
models = ["Qwen/Qwen2.5-Omni-3B"]
# CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU
if current_omni_platform.is_npu():
stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml")
elif current_omni_platform.is_rocm():
# ROCm stage config optimized for MI325 GPU
stage_config = str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")
else:
stage_config = str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")
# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models]
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
# Prepare multimodal inputs
question = "What is recited in the audio? What is in this image? Describe the video briefly."
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
audio = (audio[0][: 16000 * 5], audio[1]) # Trim to first 5 seconds
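# 16000 * 5 assumes a 16 kHz sample rate, i.e. 5 s == 80000 samples.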
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB")
if not VLLM_USE_MODELSCOPE:
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
else:
# modelscope can't access raushan-testing-hf/videos-test, skip video input temporarily
video = None
outputs = runner.generate_multimodal(
prompts=question,
audios=audio,
images=image,
videos=video,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# Find and verify audio output (code2wav stage)
audio_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "audio":
audio_output = stage_output
output_count += 1
break
assert output_count > 0
assert audio_output is not None
assert len(audio_output.request_output) > 0
# Verify audio tensor exists and has content
audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
assert audio_tensor is not None
assert audio_tensor.numel() > 0
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
# Prepare multimodal inputs
question = "What is recited in the audio? What is in this image? Describe the video briefly."
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
audio = (audio[0][: 16000 * 5], audio[1]) # Trim to first 5 seconds
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB")
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
modalities = ["text"]
outputs = runner.generate_multimodal(
prompts=question,
audios=audio,
images=image,
videos=video,
modalities=modalities,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
assert stage_output.final_output_type != "audio"
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E offline tests for Omni model with video input and audio output.
"""
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
from pathlib import Path
import pytest
from vllm.assets.video import VideoAsset
from vllm_omni.platforms import current_omni_platform
from .conftest import OmniRunner
models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
# CI stage config for 2xH100-80G GPUs or AMD GPU MI325
if current_omni_platform.is_rocm():
# ROCm stage config optimized for MI325 GPU
stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
else:
stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml")]
# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
@pytest.mark.parametrize("test_config", test_params)
def test_video_to_audio(omni_runner: type[OmniRunner], test_config) -> None:
"""Test processing video, generating audio output."""
model, stage_config_path = test_config
with omni_runner(model, seed=42, stage_configs_path=stage_config_path, stage_init_timeout=300) as runner:
# Prepare inputs
question = "Describe the video briefly."
video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
outputs = runner.generate_multimodal(
prompts=question,
videos=video,
)
# Find and verify text output (thinker stage)
text_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "text":
text_output = stage_output
output_count += 1
break
assert output_count > 0
assert text_output is not None
assert len(text_output.request_output) > 0
text_content = text_output.request_output[0].outputs[0].text
assert text_content is not None
assert len(text_content.strip()) > 0
# Find and verify audio output (code2wav stage)
audio_output = None
output_count = 0
for stage_output in outputs:
if stage_output.final_output_type == "audio":
audio_output = stage_output
output_count += 1
break
assert output_count > 0
assert audio_output is not None
assert len(audio_output.request_output) > 0
# Verify audio tensor exists and has content
audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
assert audio_tensor is not None
assert audio_tensor.numel() > 0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for Sequence Parallel (SP) backends: Ulysses and Ring attention.
Tests verify that SP inference produces correct outputs compared to baseline.
"""
import gc
import os
import sys
import time
from pathlib import Path
from typing import NamedTuple
import numpy as np
import pytest
import torch
import torch.distributed as dist
from PIL import Image
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[3]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
from vllm_omni.diffusion.data import DiffusionParallelConfig
from vllm_omni.platforms import current_omni_platform
# Test configuration
MODELS = ["riverclouds/qwen_image_random"]
PROMPT = "a photo of a cat sitting on a laptop keyboard"
DEFAULT_HEIGHT = 256
DEFAULT_WIDTH = 256
DEFAULT_SEED = 42
DEFAULT_STEPS = 4
DIFF_MEAN_THRESHOLD = 2e-2
DIFF_MAX_THRESHOLD = 2e-1
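# In 8-bit pixel terms these thresholds allow roughly 5/255 per channel on
# average (2e-2 * 255 ≈ 5.1) and 51/255 at the single worst value (2e-1 * 255 = 51).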
class InferenceResult(NamedTuple):
"""Result of an inference run."""
images: list[Image.Image]
elapsed_ms: float
def _cleanup_distributed():
"""Clean up distributed environment and GPU resources."""
if dist.is_initialized():
dist.destroy_process_group()
for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK"]:
os.environ.pop(key, None)
gc.collect()
if current_omni_platform.is_available():
current_omni_platform.empty_cache()
current_omni_platform.synchronize()
time.sleep(5)
def _diff_metrics(a: Image.Image, b: Image.Image) -> tuple[float, float]:
"""Return (mean_abs_diff, max_abs_diff) over RGB pixels in [0, 1]."""
ta = torch.from_numpy(np.asarray(a.convert("RGB"), dtype=np.float32) / 255.0)
tb = torch.from_numpy(np.asarray(b.convert("RGB"), dtype=np.float32) / 255.0)
assert ta.shape == tb.shape, f"Image shapes differ: {ta.shape} vs {tb.shape}"
abs_diff = torch.abs(ta - tb)
return abs_diff.mean().item(), abs_diff.max().item()
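# Illustrative (not executed by the test): two 8x8 grey images that differ by a
# single 8-bit step on one channel give roughly (0.0013, 0.0039), i.e. (1/255)/3
# averaged over the three channels and 1/255 at the maximum.
#   img_a = Image.new("RGB", (8, 8), (128, 128, 128))
#   img_b = Image.new("RGB", (8, 8), (129, 128, 128))
#   mean_d, max_d = _diff_metrics(img_a, img_b)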
def _run_inference(
model_name: str,
dtype: torch.dtype,
attn_backend: str,
ulysses_degree: int = 1,
ring_degree: int = 1,
height: int = DEFAULT_HEIGHT,
width: int = DEFAULT_WIDTH,
seed: int = DEFAULT_SEED,
warmup: bool = True,
) -> InferenceResult:
"""Run inference with specified configuration.
Args:
warmup: If True, run one warmup iteration before the timed run.
"""
parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree)
omni = Omni(
model=model_name,
parallel_config=parallel_config,
dtype=dtype,
attention_backend=attn_backend,
)
try:
# Warmup run (not timed)
if warmup:
_ = omni.generate(
PROMPT,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=DEFAULT_STEPS,
guidance_scale=0.0,
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000),
num_outputs_per_prompt=1,
),
)
# Timed run
start = time.time()
outputs = omni.generate(
PROMPT,
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=DEFAULT_STEPS,
guidance_scale=0.0,
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed),
num_outputs_per_prompt=1,
),
)
elapsed_ms = (time.time() - start) * 1000
return InferenceResult(
images=outputs[0].request_output[0].images,
elapsed_ms=elapsed_ms,
)
finally:
omni.close()
_cleanup_distributed()
# =============================================================================
# Correctness & Performance Tests
# =============================================================================
# SP configurations: (ulysses_degree, ring_degree, height, width, warmup, is_perf_test)
# - warmup: whether to run warmup for this SP config
# - is_perf_test: whether this is a performance test (show speedup metrics)
SP_CONFIGS = [
(2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ulysses-2 - performance test
(1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ring-2 - performance test
(2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False), # Hybrid - correctness only
(4, 1, 272, 272, False, False), # Ulysses-4 - shape and correctness
]
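# Example reading of a config tuple: (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False)
# requests a hybrid Ulysses-2 x Ring-2 run, i.e. an SP world size of 2 * 2 = 4 GPUs,
# skips the warmup pass, and is checked for correctness only (no speedup metrics).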
def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str:
"""Get SP mode name for logging."""
if ulysses_degree > 1 and ring_degree == 1:
return f"ulysses-{ulysses_degree}"
elif ring_degree > 1 and ulysses_degree == 1:
return f"ring-{ring_degree}"
else:
return f"hybrid-{ulysses_degree}x{ring_degree}"
@pytest.mark.parametrize("model_name", MODELS)
def test_sp_correctness(model_name: str):
"""Test that SP inference produces correct outputs and measure performance.
Runs baseline once per unique (height, width), then tests all SP configs.
Note: Run with `pytest -v -s` to see detailed output.
"""
device_count = current_omni_platform.get_device_count()
# Cache baseline results by (height, width)
# Key: (height, width), Value: InferenceResult
baseline_cache: dict[tuple[int, int], InferenceResult] = {}
# Collect results for summary
results: list[dict] = []
print("\n" + "=" * 70)
print(f"Sequence Parallel Test - Model: {model_name}")
print(f"Available GPUs: {device_count}")
print("=" * 70)
for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS:
sp_size = ulysses_degree * ring_degree
sp_mode = _get_sp_mode(ulysses_degree, ring_degree)
if device_count < sp_size:
print(f"\n[{sp_mode}] SKIPPED (requires {sp_size} GPUs)")
continue
# Determine baseline warmup: only for default size (performance tests)
cache_key = (height, width)
baseline_warmup = height == DEFAULT_HEIGHT and width == DEFAULT_WIDTH
# Get or compute baseline for this (height, width)
if cache_key not in baseline_cache:
print(f"\n--- Running baseline {height}x{width} (warmup={baseline_warmup}) ---")
baseline = _run_inference(
model_name,
torch.bfloat16,
"sdpa",
height=height,
width=width,
warmup=baseline_warmup,
)
assert len(baseline.images) == 1
baseline_cache[cache_key] = baseline
print(f"[baseline] {height}x{width}: {baseline.elapsed_ms:.0f}ms")
else:
baseline = baseline_cache[cache_key]
# Run SP
print(f"\n--- Running {sp_mode} (warmup={sp_warmup}) ---")
sp_result = _run_inference(
model_name,
torch.bfloat16,
"sdpa",
ulysses_degree=ulysses_degree,
ring_degree=ring_degree,
height=height,
width=width,
warmup=sp_warmup,
)
assert len(sp_result.images) == 1
# Compare outputs (correctness)
mean_diff, max_diff = _diff_metrics(baseline.images[0], sp_result.images[0])
# Build result entry
result = {
"mode": sp_mode,
"sp_size": sp_size,
"height": height,
"width": width,
"baseline_ms": baseline.elapsed_ms,
"sp_ms": sp_result.elapsed_ms,
"mean_diff": mean_diff,
"max_diff": max_diff,
"is_perf_test": is_perf_test,
}
results.append(result)
# Output based on test type
if is_perf_test:
speedup = baseline.elapsed_ms / sp_result.elapsed_ms if sp_result.elapsed_ms > 0 else 0
result["speedup"] = speedup
print(
f"[{sp_mode}] {sp_size} GPUs | "
f"baseline: {baseline.elapsed_ms:.0f}ms, sp: {sp_result.elapsed_ms:.0f}ms, "
f"speedup: {speedup:.2f}x"
)
else:
print(f"[{sp_mode}] {sp_size} GPUs | sp: {sp_result.elapsed_ms:.0f}ms (correctness only)")
print(f"[{sp_mode}] diff: mean={mean_diff:.6e}, max={max_diff:.6e}")
# Assert correctness
assert mean_diff <= DIFF_MEAN_THRESHOLD and max_diff <= DIFF_MAX_THRESHOLD, (
f"[{sp_mode}] SP output differs from baseline: mean={mean_diff:.6e}, max={max_diff:.6e}"
)
# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"{'Mode':<15} {'GPUs':<6} {'Size':<10} {'Baseline':<12} {'SP':<12} {'Speedup':<10} {'Status'}")
print("-" * 70)
for r in results:
speedup_str = f"{r['speedup']:.2f}x" if r.get("speedup") else "N/A"
baseline_str = f"{r['baseline_ms']:.0f}ms" if r["is_perf_test"] else "N/A"
status = "PASS" if r["mean_diff"] <= DIFF_MEAN_THRESHOLD else "FAIL"
print(
f"{r['mode']:<15} {r['sp_size']:<6} {r['height']}x{r['width']:<5} "
f"{baseline_str:<12} {r['sp_ms']:.0f}ms{'':<7} {speedup_str:<10} {status}"
)
print("=" * 70)
import sys
from pathlib import Path
import numpy as np
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
# Use random weights model for CI testing (small, no authentication required)
models = ["linyueqian/stable_audio_random"]
@pytest.mark.parametrize("model_name", models)
def test_stable_audio_model(model_name: str):
m = Omni(model=model_name)
# Use minimal settings for testing
# Generate a short 2-second audio clip with minimal inference steps
audio_start_in_s = 0.0
audio_end_in_s = 2.0 # Short duration for fast testing
sample_rate = 44100 # Stable Audio uses 44100 Hz
outputs = m.generate(
prompts={
"prompt": "The sound of a dog barking",
"negative_prompt": "Low quality.",
},
sampling_params_list=OmniDiffusionSamplingParams(
num_inference_steps=4, # Minimal steps for speed
guidance_scale=7.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=1,
extra_args={
"audio_start_in_s": audio_start_in_s,
"audio_end_in_s": audio_end_in_s,
},
),
)
# Extract audio from OmniRequestOutput
assert outputs is not None
first_output = outputs[0]
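# Note: the stage-level output is still typed "image" (apparently the generic
# diffusion-stage label); the per-request output checked below carries "audio".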
assert first_output.final_output_type == "image"
assert hasattr(first_output, "request_output") and first_output.request_output
req_out = first_output.request_output[0]
assert isinstance(req_out, OmniRequestOutput)
assert req_out.final_output_type == "audio"
assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
audio = req_out.multimodal_output.get("audio")
assert isinstance(audio, np.ndarray)
# audio shape: (batch, channels, samples)
# For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples
assert audio.ndim == 3
assert audio.shape[0] == 1 # batch size
assert audio.shape[1] == 2 # stereo channels
expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate)
assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds
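# Hedged follow-up sketch (not part of the test): persisting the generated clip.
# Assumes the optional `soundfile` package is available; audio has shape
# (batch, channels, samples), so the first clip is transposed to
# (samples, channels) before writing.
#   import soundfile as sf
#   sf.write("stable_audio_output.wav", audio[0].T, sample_rate)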
import os
import sys
from pathlib import Path
import pytest
import torch
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.outputs import OmniRequestOutput
from vllm_omni.platforms import current_omni_platform
# ruff: noqa: E402
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from vllm_omni import Omni
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"]
# Modelscope can't find riverclouds/qwen_image_random
# TODO: When NPU support is ready, remove this branch.
if current_omni_platform.is_npu():
models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"]
elif current_omni_platform.is_rocm():
# TODO: When ROCm support is ready, remove this branch.
# vLLM V0.11.0 has issues running riverclouds/qwen_image_random
# on ROCm
models = ["Tongyi-MAI/Z-Image-Turbo"]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str):
m = None
try:
m = Omni(model=model_name)
# high resolution may cause OOM on L4
height = 256
width = 256
outputs = m.generate(
"a photo of a cat sitting on a laptop keyboard",
OmniDiffusionSamplingParams(
height=height,
width=width,
num_inference_steps=2,
guidance_scale=0.0,
generator=torch.Generator("cuda").manual_seed(42),
num_outputs_per_prompt=2,
),
)
# Extract images from the first request output's `images` attribute
first_output = outputs[0]
assert first_output.final_output_type == "image"
if not hasattr(first_output, "request_output") or not first_output.request_output:
raise ValueError("No request_output found in OmniRequestOutput")
req_out = first_output.request_output[0]
if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
raise ValueError("Invalid request_output structure or missing 'images' key")
images = req_out.images
assert len(images) == 2
# check image size
assert images[0].width == width
assert images[0].height == height
images[0].save("image_output.png")
except Exception as e:
print(f"Test failed with error: {e}")
raise
finally:
if m is not None and hasattr(m, "close"):
m.close()