Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm-omni
Commits
c1cacde6
Commit
c1cacde6
authored
Mar 25, 2026
by
weishb
Browse files
vllm-omni_0.15.0.rc1+fix1 first commit
parent
35607782
Changes
306
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2670 additions
and
0 deletions
+2670
-0
tests/e2e/offline_inference/__init__.py
tests/e2e/offline_inference/__init__.py
+0
-0
tests/e2e/offline_inference/conftest.py
tests/e2e/offline_inference/conftest.py
+353
-0
tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
...2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
+85
-0
tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml
...ffline_inference/stage_configs/bagel_sharedmemory_ci.yaml
+83
-0
tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml
.../offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml
+104
-0
tests/e2e/offline_inference/stage_configs/qwen2_5_omni_ci.yaml
.../e2e/offline_inference/stage_configs/qwen2_5_omni_ci.yaml
+106
-0
tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
+99
-0
tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
...offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
+105
-0
tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
...e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
+99
-0
tests/e2e/offline_inference/test_bagel_text2img.py
tests/e2e/offline_inference/test_bagel_text2img.py
+308
-0
tests/e2e/offline_inference/test_cache_dit.py
tests/e2e/offline_inference/test_cache_dit.py
+92
-0
tests/e2e/offline_inference/test_diffusion_cpu_offload.py
tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+61
-0
tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
...e2e/offline_inference/test_diffusion_layerwise_offload.py
+110
-0
tests/e2e/offline_inference/test_diffusion_lora.py
tests/e2e/offline_inference/test_diffusion_lora.py
+138
-0
tests/e2e/offline_inference/test_ovis_image.py
tests/e2e/offline_inference/test_ovis_image.py
+290
-0
tests/e2e/offline_inference/test_qwen2_5_omni.py
tests/e2e/offline_inference/test_qwen2_5_omni.py
+134
-0
tests/e2e/offline_inference/test_qwen3_omni.py
tests/e2e/offline_inference/test_qwen3_omni.py
+80
-0
tests/e2e/offline_inference/test_sequence_parallel.py
tests/e2e/offline_inference/test_sequence_parallel.py
+280
-0
tests/e2e/offline_inference/test_stable_audio_model.py
tests/e2e/offline_inference/test_stable_audio_model.py
+67
-0
tests/e2e/offline_inference/test_t2i_model.py
tests/e2e/offline_inference/test_t2i_model.py
+76
-0
No files found.
Too many changes to show.
To preserve performance only
306 of 306+
files are displayed.
Plain diff
Email patch
tests/e2e/offline_inference/__init__.py
0 → 100644
View file @
c1cacde6
tests/e2e/offline_inference/conftest.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pytest configuration and fixtures for vllm-omni tests.
"""
from
typing
import
Any
import
pytest
from
vllm
import
TextPrompt
from
vllm.distributed.parallel_state
import
cleanup_dist_env_and_memory
from
tests.conftest
import
_run_post_test_cleanup
,
_run_pre_test_cleanup
from
vllm_omni.entrypoints.omni
import
Omni
from
vllm_omni.inputs.data
import
OmniSamplingParams
from
vllm_omni.outputs
import
OmniRequestOutput
# Type aliases for the optional multimodal inputs accepted by OmniRunner.
# Audio items are (audio_array, sample_rate) tuples (see get_omni_inputs docs);
# image/video payloads are intentionally untyped (Any) — e.g. PIL Images and
# numpy arrays respectively, per the get_omni_inputs docstring.
PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None
PromptImageInput = list[Any] | Any | None
PromptVideoInput = list[Any] | Any | None
class OmniRunner:
    """
    Test runner for Omni models.

    Wraps an ``Omni`` instance with test-friendly construction (pre/post
    cleanup of distributed state), prompt-building helpers, and context
    manager support so each test can tear the engine down deterministically.
    """

    def __init__(
        self,
        model_name: str,
        seed: int = 42,
        stage_init_timeout: int = 300,
        batch_timeout: int = 10,
        init_timeout: int = 300,
        shm_threshold_bytes: int = 65536,
        log_stats: bool = False,
        stage_configs_path: str | None = None,
        **kwargs,
    ) -> None:
        """
        Initialize an OmniRunner for testing.

        Args:
            model_name: The model name or path
            seed: Random seed for reproducibility
            stage_init_timeout: Timeout for initializing a single stage in seconds
            batch_timeout: Timeout for batching in seconds
            init_timeout: Timeout for initializing stages in seconds
            shm_threshold_bytes: Threshold for using shared memory
            log_stats: Enable detailed statistics logging
            stage_configs_path: Optional path to YAML stage config file
            **kwargs: Additional arguments passed to Omni
        """
        # Scrub any distributed state / memory left behind by a previous test
        # before constructing a fresh Omni instance.
        cleanup_dist_env_and_memory()
        _run_pre_test_cleanup(enable_force=True)
        _run_post_test_cleanup(enable_force=True)
        self.model_name = model_name
        # NOTE(review): `seed` is stored but not consumed anywhere in this
        # class — presumably read by tests or passed through configs; confirm.
        self.seed = seed
        self.omni = Omni(
            model=model_name,
            log_stats=log_stats,
            stage_init_timeout=stage_init_timeout,
            batch_timeout=batch_timeout,
            init_timeout=init_timeout,
            shm_threshold_bytes=shm_threshold_bytes,
            stage_configs_path=stage_configs_path,
            **kwargs,
        )

    def get_default_sampling_params_list(self) -> list[OmniSamplingParams]:
        """
        Get a list of default sampling parameters for all stages.

        Returns:
            List of SamplingParams with default decoding for each stage
        """
        return [st.default_sampling_params for st in self.omni.stage_list]

    def get_omni_inputs(
        self,
        prompts: list[str] | str,
        system_prompt: str | None = None,
        audios: PromptAudioInput = None,
        images: PromptImageInput = None,
        videos: PromptVideoInput = None,
        mm_processor_kwargs: dict[str, Any] | None = None,
        modalities: list[str] | None = None,
    ) -> list[TextPrompt]:
        """
        Construct Omni input format from prompts and multimodal data.

        Args:
            prompts: Text prompt(s) - either a single string or list of strings
            system_prompt: Optional system prompt (defaults to Qwen system prompt)
            audios: Audio input(s) - tuple of (audio_array, sample_rate) or list of tuples
            images: Image input(s) - PIL Image or list of PIL Images
            videos: Video input(s) - numpy array or list of numpy arrays
            mm_processor_kwargs: Optional processor kwargs (e.g., use_audio_in_video)
            modalities: Optional list of modality names attached to each input dict

        Returns:
            List of prompt dictionaries suitable for Omni.generate()
        """
        if system_prompt is None:
            system_prompt = (
                "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
                "Group, capable of perceiving auditory and visual inputs, as well as "
                "generating text and speech."
            )
        # Default (Qwen2.5-Omni style) multimodal placeholder tokens.
        video_padding_token = "<|VIDEO|>"
        image_padding_token = "<|IMAGE|>"
        audio_padding_token = "<|AUDIO|>"
        # Qwen3-Omni uses different padding-token spellings.
        if self.model_name == "Qwen/Qwen3-Omni-30B-A3B-Instruct":
            video_padding_token = "<|video_pad|>"
            image_padding_token = "<|image_pad|>"
            audio_padding_token = "<|audio_pad|>"
        if isinstance(prompts, str):
            prompts = [prompts]

        def _normalize_mm_input(mm_input, num_prompts):
            # Broadcast a single item to every prompt; validate list lengths.
            if mm_input is None:
                return [None] * num_prompts
            if isinstance(mm_input, list):
                if len(mm_input) != num_prompts:
                    raise ValueError(
                        f"Multimodal input list length ({len(mm_input)}) must match prompts length ({num_prompts})"
                    )
                return mm_input
            return [mm_input] * num_prompts

        num_prompts = len(prompts)
        audios_list = _normalize_mm_input(audios, num_prompts)
        images_list = _normalize_mm_input(images, num_prompts)
        videos_list = _normalize_mm_input(videos, num_prompts)
        omni_inputs = []
        for i, prompt_text in enumerate(prompts):
            user_content = ""
            multi_modal_data = {}
            audio = audios_list[i]
            if audio is not None:
                # One placeholder region per audio item; the raw data rides in
                # multi_modal_data["audio"].
                if isinstance(audio, list):
                    for _ in audio:
                        user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
                    multi_modal_data["audio"] = audio
                else:
                    user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>"
                    multi_modal_data["audio"] = audio
            image = images_list[i]
            if image is not None:
                if isinstance(image, list):
                    for _ in image:
                        user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
                    multi_modal_data["image"] = image
                else:
                    user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>"
                    multi_modal_data["image"] = image
            video = videos_list[i]
            if video is not None:
                if isinstance(video, list):
                    for _ in video:
                        user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
                    multi_modal_data["video"] = video
                else:
                    user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>"
                    multi_modal_data["video"] = video
            user_content += prompt_text
            # ChatML-style wrapping: system, user, then an open assistant turn.
            full_prompt = (
                f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
                f"<|im_start|>user\n{user_content}<|im_end|>\n"
                f"<|im_start|>assistant\n"
            )
            input_dict: TextPrompt = {"prompt": full_prompt}
            if multi_modal_data:
                input_dict["multi_modal_data"] = multi_modal_data
            if modalities:
                input_dict["modalities"] = modalities
            if mm_processor_kwargs:
                input_dict["mm_processor_kwargs"] = mm_processor_kwargs
            omni_inputs.append(input_dict)
        return omni_inputs

    def generate(
        self,
        prompts: list[TextPrompt],
        sampling_params_list: list[OmniSamplingParams] | None = None,
    ) -> list[OmniRequestOutput]:
        """
        Generate outputs for the given prompts.

        Args:
            prompts: List of prompt dictionaries with 'prompt' and optionally
                'multi_modal_data' keys
            sampling_params_list: List of sampling parameters for each stage.
                If None, uses default parameters.

        Returns:
            List of OmniRequestOutput objects from stages with final_output=True
        """
        if sampling_params_list is None:
            sampling_params_list = self.get_default_sampling_params_list()
        return self.omni.generate(prompts, sampling_params_list)

    def generate_multimodal(
        self,
        prompts: list[str] | str,
        sampling_params_list: list[OmniSamplingParams] | None = None,
        system_prompt: str | None = None,
        audios: PromptAudioInput = None,
        images: PromptImageInput = None,
        videos: PromptVideoInput = None,
        mm_processor_kwargs: dict[str, Any] | None = None,
        modalities: list[str] | None = None,
    ) -> list[OmniRequestOutput]:
        """
        Convenience method to generate with multimodal inputs.

        Args:
            prompts: Text prompt(s)
            sampling_params_list: List of sampling parameters for each stage
            system_prompt: Optional system prompt
            audios: Audio input(s)
            images: Image input(s)
            videos: Video input(s)
            mm_processor_kwargs: Optional processor kwargs
            modalities: Optional list of modality names

        Returns:
            List of OmniRequestOutput objects from stages with final_output=True
        """
        omni_inputs = self.get_omni_inputs(
            prompts=prompts,
            system_prompt=system_prompt,
            audios=audios,
            images=images,
            videos=videos,
            mm_processor_kwargs=mm_processor_kwargs,
            modalities=modalities,
        )
        return self.generate(omni_inputs, sampling_params_list)

    def generate_audio(
        self,
        prompts: list[str] | str,
        sampling_params_list: list[OmniSamplingParams] | None = None,
        system_prompt: str | None = None,
        audios: PromptAudioInput = None,
        mm_processor_kwargs: dict[str, Any] | None = None,
    ) -> list[OmniRequestOutput]:
        """
        Convenience method to generate with audio inputs only.

        Args:
            prompts: Text prompt(s)
            sampling_params_list: List of sampling parameters for each stage
            system_prompt: Optional system prompt
            audios: Audio input(s)
            mm_processor_kwargs: Optional processor kwargs

        Returns:
            List of OmniRequestOutput objects from stages with final_output=True
        """
        omni_inputs = self.get_omni_inputs(
            prompts=prompts,
            system_prompt=system_prompt,
            audios=audios,
            mm_processor_kwargs=mm_processor_kwargs,
        )
        return self.generate(omni_inputs, sampling_params_list)

    def generate_video(
        self,
        prompts: list[str] | str,
        sampling_params_list: list[OmniSamplingParams] | None = None,
        system_prompt: str | None = None,
        videos: PromptVideoInput = None,
        mm_processor_kwargs: dict[str, Any] | None = None,
    ) -> list[OmniRequestOutput]:
        """
        Convenience method to generate with video inputs only.

        Args:
            prompts: Text prompt(s)
            sampling_params_list: List of sampling parameters for each stage
            system_prompt: Optional system prompt
            videos: Video input(s)
            mm_processor_kwargs: Optional processor kwargs

        Returns:
            List of OmniRequestOutput objects from stages with final_output=True
        """
        omni_inputs = self.get_omni_inputs(
            prompts=prompts,
            system_prompt=system_prompt,
            videos=videos,
            mm_processor_kwargs=mm_processor_kwargs,
        )
        return self.generate(omni_inputs, sampling_params_list)

    def generate_image(
        self,
        prompts: list[str] | str,
        sampling_params_list: list[OmniSamplingParams] | None = None,
        system_prompt: str | None = None,
        images: PromptImageInput = None,
        mm_processor_kwargs: dict[str, Any] | None = None,
    ) -> list[OmniRequestOutput]:
        """
        Convenience method to generate with image inputs only.

        Args:
            prompts: Text prompt(s)
            sampling_params_list: List of sampling parameters for each stage
            system_prompt: Optional system prompt
            images: Image input(s)
            mm_processor_kwargs: Optional processor kwargs

        Returns:
            List of OmniRequestOutput objects from stages with final_output=True
        """
        omni_inputs = self.get_omni_inputs(
            prompts=prompts,
            system_prompt=system_prompt,
            images=images,
            mm_processor_kwargs=mm_processor_kwargs,
        )
        return self.generate(omni_inputs, sampling_params_list)

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - cleanup resources."""
        self.close()
        # Drop the engine reference before cleanup so its memory can be freed.
        del self.omni
        cleanup_dist_env_and_memory()
        _run_post_test_cleanup(enable_force=True)

    def close(self):
        """Close and cleanup the Omni instance."""
        # hasattr guard: tolerate Omni implementations without a close() method.
        if hasattr(self.omni, "close"):
            self.omni.close()
@pytest.fixture(scope="session")
def omni_runner():
    # NOTE: returns the OmniRunner *class*, not an instance, so each test
    # constructs (and tears down, e.g. via `with OmniRunner(...)`) its own
    # runner with test-specific arguments.
    return OmniRunner
tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
0 → 100644
View file @
c1cacde6
# stage config for running BAGEL with Mooncake connector for CI e2e tests.
# This config is optimized for single GPU tests with Mooncake inter-stage communication.
stage_args
:
-
stage_id
:
0
stage_type
:
llm
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
BagelForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.35
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
text
distributed_executor_backend
:
mp
enable_prefix_caching
:
false
max_num_batched_tokens
:
32768
tensor_parallel_size
:
1
omni_kv_config
:
need_send_cache
:
true
kv_transfer_criteria
:
type
:
prefill_finished
final_output
:
true
final_output_type
:
text
is_comprehension
:
true
default_sampling_params
:
temperature
:
0.4
top_p
:
0.9
top_k
:
1
max_tokens
:
2048
seed
:
52
detokenize
:
true
repetition_penalty
:
1.05
output_connectors
:
to_stage_1
:
mooncake_connector
-
stage_id
:
1
stage_type
:
diffusion
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
dit
gpu_memory_utilization
:
0.55
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
image
distributed_executor_backend
:
mp
enable_prefix_caching
:
false
max_num_batched_tokens
:
32768
tensor_parallel_size
:
1
omni_kv_config
:
need_recv_cache
:
true
engine_input_source
:
[
0
]
final_output
:
true
final_output_type
:
image
is_comprehension
:
false
default_sampling_params
:
seed
:
52
input_connectors
:
from_stage_0
:
mooncake_connector
# Top-level runtime config with Mooncake connector
runtime
:
enabled
:
true
defaults
:
window_size
:
-1
max_inflight
:
1
connectors
:
mooncake_connector
:
name
:
MooncakeConnector
extra
:
host
:
"
${MOONCAKE_HOST}"
metadata_server
:
"
http://${MOONCAKE_HOST}:${MOONCAKE_HTTP_PORT}/metadata"
master
:
"
${MOONCAKE_HOST}:${MOONCAKE_RPC_PORT}"
segment
:
64000000
localbuf
:
64000000
proto
:
tcp
edges
:
-
from
:
0
to
:
1
window_size
:
-1
tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml
0 → 100644
View file @
c1cacde6
# stage config for running BAGEL with SharedMemory connector for CI e2e tests.
# This config is optimized for single GPU tests with SharedMemory inter-stage communication.
stage_args
:
-
stage_id
:
0
stage_type
:
llm
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
BagelForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.35
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
text
distributed_executor_backend
:
"
mp"
enable_prefix_caching
:
false
max_num_batched_tokens
:
32768
tensor_parallel_size
:
1
omni_kv_config
:
need_send_cache
:
true
kv_transfer_criteria
:
type
:
prefill_finished
# or special token generated
final_output
:
true
final_output_type
:
text
is_comprehension
:
true
default_sampling_params
:
temperature
:
0.4
top_p
:
0.9
top_k
:
1
max_tokens
:
2048
seed
:
52
detokenize
:
True
repetition_penalty
:
1.05
-
stage_id
:
1
stage_type
:
diffusion
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
dit
gpu_memory_utilization
:
0.55
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
image
distributed_executor_backend
:
"
mp"
enable_prefix_caching
:
false
max_num_batched_tokens
:
32768
tensor_parallel_size
:
1
omni_kv_config
:
need_recv_cache
:
true
engine_input_source
:
[
0
]
final_output
:
true
final_output_type
:
image
is_comprehension
:
false
default_sampling_params
:
seed
:
52
# Runtime edges
runtime
:
enabled
:
true
defaults
:
window_size
:
-1
max_inflight
:
1
# Distributed connectors configuration (optional)
# More connectors will be supported in the future.
connectors
:
shared_memory_connector
:
name
:
SharedMemoryConnector
extra
:
shm_threshold_bytes
:
65536
# 64KB threshold
edges
:
-
from
:
0
to
:
1
window_size
:
-1
tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml
0 → 100644
View file @
c1cacde6
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# This config is optimized for CI e2e tests.
stage_args
:
-
stage_id
:
0
runtime
:
process
:
true
# Run this stage in a separate process
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
# Now we only support eager mode
trust_remote_code
:
true
engine_output_type
:
latent
enable_prefix_caching
:
false
is_comprehension
:
true
final_output
:
true
final_output_type
:
text
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
-
stage_id
:
1
runtime
:
process
:
true
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
talker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
latent
engine_input_source
:
[
0
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params
:
temperature
:
0.9
top_p
:
0.8
top_k
:
40
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.05
stop_token_ids
:
[
8294
]
-
stage_id
:
2
runtime
:
process
:
true
devices
:
"
0"
# Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size
:
1
engine_args
:
model_stage
:
code2wav
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
generation
scheduler_cls
:
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization
:
0.15
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
audio
engine_input_source
:
[
1
]
final_output
:
true
final_output_type
:
audio
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
# Top-level runtime config (concise): default windows and stage edges
runtime
:
enabled
:
true
defaults
:
window_size
:
-1
# Simplified: trigger downstream only after full upstream completion
max_inflight
:
1
# Simplified: process serially within each stage
edges
:
-
from
:
0
# thinker → talker: trigger only after receiving full input (-1)
to
:
1
window_size
:
-1
-
from
:
1
# talker → code2wav: trigger only after receiving full input (-1)
to
:
2
window_size
:
-1
tests/e2e/offline_inference/stage_configs/qwen2_5_omni_ci.yaml
0 → 100644
View file @
c1cacde6
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args
:
-
stage_id
:
0
runtime
:
process
:
true
# Run this stage in a separate process
devices
:
"
0"
# Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
# Now we only support eager mode
trust_remote_code
:
true
engine_output_type
:
latent
enable_prefix_caching
:
false
is_comprehension
:
true
final_output
:
true
final_output_type
:
text
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
-
stage_id
:
1
runtime
:
process
:
true
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
talker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
latent
engine_input_source
:
[
0
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params
:
temperature
:
0.9
top_p
:
0.8
top_k
:
40
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.05
stop_token_ids
:
[
8294
]
-
stage_id
:
2
runtime
:
process
:
true
devices
:
"
0"
# Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size
:
1
engine_args
:
model_stage
:
code2wav
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
generation
scheduler_cls
:
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization
:
0.15
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
audio
max_num_batched_tokens: 4069  # NOTE(review): 4069 looks like a typo for 4096 — confirm intended value
engine_input_source
:
[
1
]
final_output
:
true
final_output_type
:
audio
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
# Top-level runtime config (concise): default windows and stage edges
runtime
:
enabled
:
true
defaults
:
window_size
:
-1
# Simplified: trigger downstream only after full upstream completion
max_inflight
:
1
# Simplified: process serially within each stage
edges
:
-
from
:
0
# thinker → talker: trigger only after receiving full input (-1)
to
:
1
window_size
:
-1
-
from
:
1
# talker → code2wav: trigger only after receiving full input (-1)
to
:
2
window_size
:
-1
tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
0 → 100644
View file @
c1cacde6
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args
:
-
stage_id
:
0
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.9
enforce_eager
:
false
trust_remote_code
:
true
engine_output_type
:
latent
# Output hidden states for talker
distributed_executor_backend
:
"
mp"
enable_prefix_caching
:
false
hf_config_name
:
thinker_config
tensor_parallel_size
:
1
load_format
:
dummy
final_output
:
true
final_output_type
:
text
is_comprehension
:
true
default_sampling_params
:
temperature
:
0.4
top_p
:
0.9
top_k
:
1
max_tokens
:
100
seed
:
42
detokenize
:
True
repetition_penalty
:
1.05
-
stage_id
:
1
runtime
:
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
talker
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.6
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
latent
# Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching
:
false
distributed_executor_backend
:
"
mp"
hf_config_name
:
talker_config
load_format
:
dummy
engine_input_source
:
[
0
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params
:
temperature
:
0.9
top_k
:
50
max_tokens
:
100
seed
:
42
detokenize
:
False
repetition_penalty
:
1.05
stop_token_ids
:
[
2150
]
-
stage_id
:
2
runtime
:
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
code2wav
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
generation
scheduler_cls
:
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
audio
# Final output: audio waveform
gpu_memory_utilization
:
0.1
distributed_executor_backend
:
"
mp"
max_num_batched_tokens
:
1000000
hf_config_name
:
thinker_config
load_format
:
dummy
async_scheduling
:
false
engine_input_source
:
[
1
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output
:
true
final_output_type
:
audio
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
200
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
0 → 100644
View file @
c1cacde6
# stage config for running qwen2.5-omni with architecture of OmniLLM.
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args
:
-
stage_id
:
0
runtime
:
process
:
true
# Run this stage in a separate process
devices
:
"
0"
# Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
# Now we only support eager mode
trust_remote_code
:
true
engine_output_type
:
latent
enable_prefix_caching
:
false
is_comprehension
:
true
final_output
:
true
final_output_type
:
text
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
-
stage_id
:
1
runtime
:
process
:
true
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
talker
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
max_model_len
:
896
max_num_batched_tokens
:
896
max_num_seqs
:
1
gpu_memory_utilization
:
0.8
skip_mm_profiling
:
true
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
latent
engine_input_source
:
[
0
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params
:
temperature
:
0.9
top_p
:
0.8
top_k
:
40
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.05
stop_token_ids
:
[
8294
]
-
stage_id
:
2
runtime
:
process
:
true
devices
:
"
0"
# Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size
:
1
engine_args
:
model_stage
:
code2wav
model_arch
:
Qwen2_5OmniForConditionalGeneration
worker_type
:
generation
scheduler_cls
:
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization
:
0.15
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
audio
engine_input_source
:
[
1
]
final_output
:
true
final_output_type
:
audio
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
128
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
# Top-level runtime config (concise): default windows and stage edges
runtime
:
enabled
:
true
defaults
:
window_size
:
-1
# Simplified: trigger downstream only after full upstream completion
max_inflight
:
1
# Simplified: process serially within each stage
edges
:
-
from
:
0
# thinker → talker: trigger only after receiving full input (-1)
to
:
1
window_size
:
-1
-
from
:
1
# talker → code2wav: trigger only after receiving full input (-1)
to
:
2
window_size
:
-1
tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
0 → 100644
View file @
c1cacde6
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
# The following config has been verified on 2x H100-80G GPUs.
stage_args
:
-
stage_id
:
0
runtime
:
devices
:
"
0"
max_batch_size
:
1
engine_args
:
model_stage
:
thinker
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.9
enforce_eager
:
false
trust_remote_code
:
true
engine_output_type
:
latent
# Output hidden states for talker
distributed_executor_backend
:
"
mp"
enable_prefix_caching
:
false
hf_config_name
:
thinker_config
tensor_parallel_size
:
1
load_format
:
dummy
final_output
:
true
final_output_type
:
text
is_comprehension
:
true
default_sampling_params
:
temperature
:
0.4
top_p
:
0.9
top_k
:
1
max_tokens
:
100
seed
:
42
detokenize
:
True
repetition_penalty
:
1.05
-
stage_id
:
1
runtime
:
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
talker
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
ar
scheduler_cls
:
vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization
:
0.6
enforce_eager
:
true
trust_remote_code
:
true
engine_output_type
:
latent
# Output codec codes for code2wav
# tensor_parallel_size: 2
enable_prefix_caching
:
false
distributed_executor_backend
:
"
mp"
hf_config_name
:
talker_config
load_format
:
dummy
engine_input_source
:
[
0
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
# final_output: true
# final_output_type: text
default_sampling_params
:
temperature
:
0.9
top_k
:
50
max_tokens
:
100
seed
:
42
detokenize
:
False
repetition_penalty
:
1.05
stop_token_ids
:
[
2150
]
-
stage_id
:
2
runtime
:
devices
:
"
1"
max_batch_size
:
1
engine_args
:
model_stage
:
code2wav
model_arch
:
Qwen3OmniMoeForConditionalGeneration
worker_type
:
generation
scheduler_cls
:
vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager
:
true
trust_remote_code
:
true
enable_prefix_caching
:
false
engine_output_type
:
audio
# Final output: audio waveform
gpu_memory_utilization
:
0.1
distributed_executor_backend
:
"
mp"
max_num_batched_tokens
:
1000000
hf_config_name
:
thinker_config
load_format
:
dummy
async_scheduling
:
false
engine_input_source
:
[
1
]
custom_process_input_func
:
vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
final_output
:
true
final_output_type
:
audio
default_sampling_params
:
temperature
:
0.0
top_p
:
1.0
top_k
:
-1
max_tokens
:
200
seed
:
42
detokenize
:
True
repetition_penalty
:
1.1
tests/e2e/offline_inference/test_bagel_text2img.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end test for Bagel text2img generation.
This test validates that the Bagel model generates images that match
expected reference pixel values within a ±5 tolerance.
Equivalent to running:
python3 examples/offline_inference/bagel/end2end.py
\
--prompts "A futuristic city skyline at twilight, cyberpunk style"
\
--modality text2img --step 15
"""
import
os
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
os
.
environ
[
"VLLM_TEST_CLEAN_GPU_MEMORY"
]
=
"1"
import
signal
import
socket
import
subprocess
import
tempfile
import
time
from
pathlib
import
Path
from
typing
import
Any
import
pytest
from
PIL
import
Image
from
tests.utils
import
hardware_test
from
vllm_omni.entrypoints.omni
import
Omni
# Reference pixel data extracted from the known-good output image
# Each entry contains (x, y) position and expected (R, G, B) values
# "Generated with seed=52, num_inference_steps=15,
# prompt='A futuristic city skyline at twilight, cyberpunk style'"
REFERENCE_PIXELS
=
[
{
"position"
:
(
100
,
100
),
"rgb"
:
(
68
,
107
,
134
)},
{
"position"
:
(
400
,
50
),
"rgb"
:
(
95
,
139
,
166
)},
{
"position"
:
(
700
,
100
),
"rgb"
:
(
99
,
122
,
151
)},
{
"position"
:
(
150
,
400
),
"rgb"
:
(
111
,
125
,
153
)},
{
"position"
:
(
512
,
512
),
"rgb"
:
(
97
,
107
,
131
)},
{
"position"
:
(
700
,
400
),
"rgb"
:
(
48
,
64
,
98
)},
{
"position"
:
(
100
,
700
),
"rgb"
:
(
79
,
63
,
84
)},
{
"position"
:
(
400
,
700
),
"rgb"
:
(
40
,
58
,
79
)},
{
"position"
:
(
700
,
700
),
"rgb"
:
(
60
,
75
,
103
)},
{
"position"
:
(
256
,
256
),
"rgb"
:
(
97
,
128
,
156
)},
]
# Maximum allowed difference per color channel
PIXEL_TOLERANCE
=
5
# Default test prompt
DEFAULT_PROMPT
=
"<|im_start|>A futuristic city skyline at twilight, cyberpunk style<|im_end|>"
def
_find_free_port
()
->
int
:
"""Find and return a free ephemeral port by binding to port 0."""
with
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
as
s
:
s
.
bind
((
"127.0.0.1"
,
0
))
s
.
listen
(
1
)
port
=
s
.
getsockname
()[
1
]
return
port
def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list:
    """Prepare per-stage sampling parameters for Bagel text2img generation.

    Args:
        omni: The Omni instance whose default params are adjusted in place.
        max_tokens: Token budget for the first (AR) stage.
        num_inference_steps: Diffusion steps for the second stage, when present.

    Returns:
        The adjusted sampling params list.
    """
    stage_params = omni.default_sampling_params_list
    stage_params[0].max_tokens = max_tokens  # type: ignore
    has_diffusion_stage = len(stage_params) > 1
    if has_diffusion_stage:
        stage_params[1].num_inference_steps = num_inference_steps  # type: ignore
    return stage_params
def _extract_generated_image(omni_outputs: list) -> Image.Image | None:
    """Pull the first generated PIL image out of Omni outputs.

    Checks each output's top-level ``images`` attribute first, then falls
    back to scanning nested stage outputs under ``request_output``.

    Args:
        omni_outputs: List of outputs from omni.generate().

    Returns:
        The first generated PIL Image, or None if no image is found.
    """
    for output in omni_outputs:
        direct = getattr(output, "images", None)
        if direct:
            return direct[0]
        nested = getattr(output, "request_output", None)
        if nested:
            for stage in nested:
                stage_images = getattr(stage, "images", None)
                if stage_images:
                    return stage_images[0]
    return None
def _validate_pixels(
    image: Image.Image,
    reference_pixels: list[dict[str, Any]] = REFERENCE_PIXELS,
    tolerance: int = PIXEL_TOLERANCE,
) -> None:
    """Assert that sampled pixels of *image* match the reference values.

    Args:
        image: The PIL Image to validate.
        reference_pixels: Dicts with 'position' (x, y) and 'rgb' (R, G, B).
        tolerance: Maximum allowed difference per color channel.

    Raises:
        AssertionError: If any sampled pixel differs beyond tolerance.
    """
    for entry in reference_pixels:
        x, y = entry["position"]
        expected = entry["rgb"]
        actual = image.getpixel((x, y))[:3]
        deltas = [abs(got - want) for got, want in zip(actual, expected)]
        assert all(d <= tolerance for d in deltas), (
            f"Pixel mismatch at ({x}, {y}): expected {expected}, got {actual}"
        )
def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Image:
    """Run Bagel text2img generation and return the resulting image.

    Args:
        omni: The Omni instance to use for generation.
        prompt: The text prompt for image generation.

    Returns:
        The generated PIL Image (expected to be 1024x1024).

    Raises:
        AssertionError: If no image is generated or its size is wrong.
    """
    sampling_params = _configure_sampling_params(omni)
    request = {"prompt": prompt, "modalities": ["image"]}
    outputs = list(
        omni.generate(
            prompts=[request],
            sampling_params_list=sampling_params,
        )
    )
    generated_image = _extract_generated_image(outputs)
    assert generated_image is not None, "No images generated"
    assert generated_image.size == (1024, 1024), f"Expected 1024x1024, got {generated_image.size}"
    return generated_image
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_shared_memory_connector():
    """Test Bagel text2img with shared memory connector."""
    stage_config = Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml"
    engine = Omni(
        model="ByteDance-Seed/BAGEL-7B-MoT",
        stage_configs_path=str(stage_config),
        stage_init_timeout=300,
    )
    try:
        # Generate once, then compare sampled pixels against the references.
        image = _generate_bagel_image(engine)
        _validate_pixels(image)
    finally:
        engine.close()
def
_wait_for_port
(
host
:
str
,
port
:
int
,
timeout
:
int
=
30
)
->
bool
:
"""Wait for a port to become available.
Args:
host: The host address.
port: The port number.
timeout: Maximum seconds to wait.
Returns:
True if port becomes available, False otherwise.
"""
for
_
in
range
(
timeout
):
try
:
with
socket
.
create_connection
((
host
,
port
),
timeout
=
1
):
return
True
except
(
TimeoutError
,
ConnectionRefusedError
):
time
.
sleep
(
1
)
return
False
def _cleanup_mooncake_processes(timeout_secs: int = 5) -> None:
    """Terminate any lingering mooncake_master processes.

    Sends a polite SIGTERM via ``pkill`` first, polls with ``pgrep`` until the
    processes disappear, and escalates to SIGKILL if any survive past
    ``timeout_secs`` seconds.

    Args:
        timeout_secs: Maximum seconds to wait for graceful termination.
    """
    quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
    subprocess.run(["pkill", "-f", "mooncake_master"], **quiet)
    deadline = time.time() + timeout_secs
    all_gone = False
    while time.time() < deadline:
        survivors = subprocess.run(["pgrep", "-f", "mooncake_master"], **quiet)
        # pgrep exits non-zero when no matching process remains.
        if survivors.returncode != 0:
            all_gone = True
            break
        time.sleep(0.5)
    if not all_gone:
        # Graceful shutdown timed out; force-kill and give the OS a moment.
        subprocess.run(["pkill", "-9", "-f", "mooncake_master"], **quiet)
        time.sleep(1)
def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str:
    """Render the Mooncake stage-config template with concrete endpoints.

    Reads the CI YAML template, substitutes the ``${MOONCAKE_*}``
    placeholders, and writes the result to a temporary file.

    Args:
        host: Mooncake host address.
        rpc_port: RPC port for the Mooncake master.
        http_port: HTTP metadata server port.

    Returns:
        Path to the temporary config file with substituted values.
    """
    template = Path(__file__).parent / "stage_configs" / "bagel_mooncake_ci.yaml"
    with open(str(template)) as f:
        content = f.read()
    substitutions = {
        "${MOONCAKE_HOST}": host,
        "${MOONCAKE_RPC_PORT}": str(rpc_port),
        "${MOONCAKE_HTTP_PORT}": str(http_port),
    }
    for placeholder, value in substitutions.items():
        content = content.replace(placeholder, value)
    # delete=False: the caller is responsible for unlinking the file.
    rendered = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
    with rendered:
        rendered.write(content)
    return rendered.name
@pytest.mark.core_model
@pytest.mark.diffusion
@hardware_test(res={"cuda": "H100"})
def test_bagel_text2img_mooncake_connector():
    """Test Bagel text2img with Mooncake connector for inter-stage communication."""
    MOONCAKE_HOST = "127.0.0.1"
    # Ephemeral ports so concurrent CI jobs on the same host do not collide.
    MOONCAKE_RPC_PORT = _find_free_port()
    MOONCAKE_HTTP_PORT = _find_free_port()
    MOONCAKE_METRICS_PORT = _find_free_port()
    # Track resources as None so the finally block can tear down only what
    # was actually created before a failure.
    mooncake_master_proc = None
    temp_config_file = None
    omni = None
    try:
        # Remove any mooncake_master left over from a previous (crashed) run.
        _cleanup_mooncake_processes()
        # Start mooncake_master
        mooncake_master_proc = subprocess.Popen(
            [
                "mooncake_master",
                f"--rpc_port={MOONCAKE_RPC_PORT}",
                "--enable_http_metadata_server=true",
                "--http_metadata_server_host=0.0.0.0",
                f"--http_metadata_server_port={MOONCAKE_HTTP_PORT}",
                f"--metrics_port={MOONCAKE_METRICS_PORT}",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            # Start in a new session so the whole process group can be killed
            # together during cleanup (see os.killpg below).
            preexec_fn=os.setsid,
        )
        assert _wait_for_port(MOONCAKE_HOST, MOONCAKE_RPC_PORT), "mooncake_master failed to start"
        # Create temp config and initialize Omni
        temp_config_file = _load_mooncake_config(
            host=MOONCAKE_HOST,
            rpc_port=MOONCAKE_RPC_PORT,
            http_port=MOONCAKE_HTTP_PORT,
        )
        omni = Omni(
            model="ByteDance-Seed/BAGEL-7B-MoT",
            stage_configs_path=temp_config_file,
            stage_init_timeout=300,
        )
        generated_image = _generate_bagel_image(omni)
        _validate_pixels(generated_image)
    finally:
        # Teardown order matters: stop the engine before killing the
        # mooncake_master it communicates with.
        if omni:
            omni.close()
        if temp_config_file:
            try:
                os.unlink(temp_config_file)
            except OSError:
                pass
        if mooncake_master_proc:
            try:
                # Kill the entire process group created via os.setsid above.
                os.killpg(os.getpgid(mooncake_master_proc.pid), signal.SIGKILL)
            except OSError:
                pass
tests/e2e/offline_inference/test_cache_dit.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for cache-dit backend.
This test verifies that cache-dit acceleration works correctly with diffusion models.
It uses minimal settings to keep test time short for CI.
"""
import
os
import
sys
from
pathlib
import
Path
import
pytest
import
torch
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
# ruff: noqa: E402
REPO_ROOT
=
Path
(
__file__
).
resolve
().
parents
[
2
]
if
str
(
REPO_ROOT
)
not
in
sys
.
path
:
sys
.
path
.
insert
(
0
,
str
(
REPO_ROOT
))
from
vllm_omni
import
Omni
from
vllm_omni.outputs
import
OmniRequestOutput
os
.
environ
[
"VLLM_TEST_CLEAN_GPU_MEMORY"
]
=
"1"
# Use random weights model for testing
models
=
[
"riverclouds/qwen_image_random"
]
@pytest.mark.parametrize("model_name", models)
def test_cache_dit(model_name: str):
    """Test cache-dit backend with diffusion model.

    Builds an Omni engine with the cache_dit acceleration backend, runs a
    single small generation, and checks that exactly one image of the
    requested size comes back.
    """
    # Configure cache-dit with minimal settings for fast testing
    cache_config = {
        "Fn_compute_blocks": 1,
        "Bn_compute_blocks": 0,
        "max_warmup_steps": 2,  # Minimal warmup for fast test
        "residual_diff_threshold": 0.24,
        "max_continuous_cached_steps": 3,
    }
    # m stays None until construction succeeds so the finally block can
    # close only a real engine.
    m = None
    try:
        m = Omni(
            model=model_name,
            cache_backend="cache_dit",
            cache_config=cache_config,
        )
        # Use minimal settings for fast testing
        height = 256
        width = 256
        num_inference_steps = 4  # Minimal steps for fast test
        outputs = m.generate(
            "a photo of a cat sitting on a laptop keyboard",
            OmniDiffusionSamplingParams(
                height=height,
                width=width,
                num_inference_steps=num_inference_steps,
                guidance_scale=0.0,
                # Fixed seed keeps the run deterministic across CI executions.
                generator=torch.Generator("cuda").manual_seed(42),
                num_outputs_per_prompt=1,  # Single output for speed
            ),
        )
        # Extract images from request_output[0]['images']
        first_output = outputs[0]
        assert first_output.final_output_type == "image"
        if not hasattr(first_output, "request_output") or not first_output.request_output:
            raise ValueError("No request_output found in OmniRequestOutput")
        req_out = first_output.request_output[0]
        if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
            raise ValueError("Invalid request_output structure or missing 'images' key")
        images = req_out.images
        # Verify generation succeeded
        assert images is not None
        assert len(images) == 1
        # Check image size
        assert images[0].width == width
        assert images[0].height == height
    except Exception as e:
        # Surface the failure in the captured stdout before re-raising.
        print(f"Test failed with error: {e}")
        raise
    finally:
        if m is not None and hasattr(m, "close"):
            m.close()
tests/e2e/offline_inference/test_diffusion_cpu_offload.py
0 → 100644
View file @
c1cacde6
import
sys
from
pathlib
import
Path
import
pytest
import
torch
from
vllm.distributed.parallel_state
import
cleanup_dist_env_and_memory
from
tests.utils
import
GPUMemoryMonitor
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
from
vllm_omni.platforms
import
current_omni_platform
# ruff: noqa: E402
REPO_ROOT
=
Path
(
__file__
).
resolve
().
parents
[
2
]
if
str
(
REPO_ROOT
)
not
in
sys
.
path
:
sys
.
path
.
insert
(
0
,
str
(
REPO_ROOT
))
from
vllm_omni
import
Omni
models
=
[
"riverclouds/qwen_image_random"
]
def inference(model_name: str, offload: bool = True):
    """Run one image generation and return the peak GPU memory used, in MB.

    Args:
        model_name: Model identifier to load into the Omni engine.
        offload: Whether to enable CPU offload in the engine.

    Returns:
        Peak GPU memory usage (MB) observed by the monitor during the run.
    """
    current_omni_platform.empty_cache()
    device_index = torch.cuda.current_device()
    monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
    monitor.start()
    m = None
    try:
        m = Omni(model=model_name, enable_cpu_offload=offload)
        torch.cuda.reset_peak_memory_stats(device=device_index)
        height = 256
        width = 256
        m.generate(
            "a photo of a cat sitting on a laptop keyboard",
            OmniDiffusionSamplingParams(
                height=height,
                width=width,
                num_inference_steps=9,
                guidance_scale=0.0,
                generator=torch.Generator("cuda").manual_seed(42),
            ),
        )
        # Capture the peak before tearing anything down.
        peak = monitor.peak_used_mb
    finally:
        # Fix: always stop the monitor thread and release the engine, even
        # when generation raises — the original leaked both on failure.
        monitor.stop()
        if m is not None and hasattr(m, "close"):
            m.close()
    return peak
@pytest.mark.skipif(
    current_omni_platform.is_npu() or current_omni_platform.is_rocm(),
    reason="Hardware not supported",
)
@pytest.mark.parametrize("model_name", models)
def test_cpu_offload_diffusion_model(model_name: str):
    """Verify CPU offload reduces peak GPU memory by a meaningful margin.

    Runs the same generation twice (without and with offload) and requires
    the offloaded run's peak to be at least 2500 MB lower.
    """
    try:
        no_offload_peak_memory = inference(model_name, offload=False)
        # Reset distributed state/memory between the two engine lifetimes.
        cleanup_dist_env_and_memory()
        offload_peak_memory = inference(model_name, offload=True)
    except Exception as e:
        # Fix: include the underlying error — the original swallowed it,
        # making CI failures undiagnosable.
        pytest.fail(f"Inference failed: {e}")
    print(f"Offload peak memory: {offload_peak_memory} MB")
    print(f"No offload peak memory: {no_offload_peak_memory} MB")
    # Require a margin of at least 2500 MB of savings, not just "less than".
    assert offload_peak_memory + 2500 < no_offload_peak_memory, (
        f"Offload peak memory {offload_peak_memory} MB should be less than no offload peak memory {no_offload_peak_memory} MB"
    )
tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
0 → 100644
View file @
c1cacde6
import
sys
from
pathlib
import
Path
import
pytest
import
torch
from
vllm.distributed.parallel_state
import
cleanup_dist_env_and_memory
from
tests.utils
import
GPUMemoryMonitor
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
from
vllm_omni.platforms
import
current_omni_platform
# ruff: noqa: E402
REPO_ROOT
=
Path
(
__file__
).
resolve
().
parents
[
2
]
if
str
(
REPO_ROOT
)
not
in
sys
.
path
:
sys
.
path
.
insert
(
0
,
str
(
REPO_ROOT
))
from
vllm_omni
import
Omni
# Models to test and expected saved memory in MB, correspondingly
MODELS_SAVED_MEMORY_MB
=
{
"riverclouds/qwen_image_random"
:
4500
}
def run_inference(
    model_name: str,
    layerwise_offload: bool = False,
    num_gpu_layers: int = 1,
    num_inference_steps: int = 3,
) -> float:
    """Run one small video generation and return peak GPU memory used (MB).

    Args:
        model_name: Model identifier to load.
        layerwise_offload: Whether to enable layerwise offloading.
        num_gpu_layers: Number of transformer layers kept on GPU when
            layerwise offloading is enabled.
        num_inference_steps: Diffusion steps for the generation.

    Returns:
        Peak GPU memory usage (MB) observed by the monitor during the run.
    """
    # For now, only support on GPU, so apply torch.cuda operations here
    # NPU / ROCm platforms are expected to be detected and skipped this test function
    torch.cuda.empty_cache()
    device_index = torch.cuda.current_device()
    monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02)
    monitor.start()
    m = Omni(
        model=model_name,
        enable_layerwise_offload=layerwise_offload,
        layerwise_num_gpu_layers=num_gpu_layers,
        # NOTE(review): boundary_ratio/flow_shift appear model-specific;
        # values taken as-is from the original test — confirm against the
        # model's recommended settings.
        boundary_ratio=0.875,
        flow_shift=5.0,
    )
    # Reset the allocator's peak so the measurement excludes model loading.
    torch.cuda.reset_peak_memory_stats(device=device_index)
    # Refer to tests/e2e/offline_inference/test_t2v_model.py
    # Use minimal settings for testing
    height = 480
    width = 640
    num_frames = 5
    m.generate(
        "A cat sitting on a table",
        OmniDiffusionSamplingParams(
            height=height,
            width=width,
            generator=torch.Generator("cuda").manual_seed(42),
            guidance_scale=1.0,
            num_inference_steps=num_inference_steps,
            num_frames=num_frames,
        ),
    )
    peak = monitor.peak_used_mb
    monitor.stop()
    return peak
@pytest.mark.skipif(
    current_omni_platform.is_npu() or current_omni_platform.is_rocm(),
    reason="Hardware not supported",
)
@pytest.mark.parametrize("model_name", MODELS_SAVED_MEMORY_MB.keys())
def test_layerwise_offload_diffusion_model(model_name: str):
    """Test that layerwise offloading reduces GPU memory usage.

    This test verifies that layerwise offloading significantly reduces peak
    GPU memory usage compared to loading the entire model on GPU. The layerwise
    offloader keeps only a single transformer block on GPU at a time, with
    prefetching for compute-memory overlap.
    """
    try:
        # Run without layerwise offloading (baseline)
        no_offload_peak_memory = run_inference(model_name, layerwise_offload=False)
        cleanup_dist_env_and_memory()
        # Run with layerwise offloading (1 layer on device)
        layerwise_offload_peak_memory = run_inference(model_name, layerwise_offload=True, num_gpu_layers=1)
        cleanup_dist_env_and_memory()
        # Run with 2 layers on device
        layerwise_offload_two_layers_peak = run_inference(model_name, layerwise_offload=True, num_gpu_layers=2)
    except Exception as e:
        # Fix: include the underlying error — the original swallowed it,
        # making CI failures undiagnosable.
        pytest.fail(f"Inference failed: {e}")
    print(f"Layerwise offload peak memory (1 GPU layer): {layerwise_offload_peak_memory} MB")
    print(f"Layerwise offload peak memory (2 GPU layers): {layerwise_offload_two_layers_peak} MB")
    print(f"No offload peak memory: {no_offload_peak_memory} MB")
    # Verify that layerwise offloading significantly reduces memory usage
    # Passes only if the actual savings exceeds the expected savings
    assert layerwise_offload_peak_memory + MODELS_SAVED_MEMORY_MB[model_name] < no_offload_peak_memory, (
        f"Layerwise offload peak memory {layerwise_offload_peak_memory} MB "
        f"should be significantly less than no offload peak memory {no_offload_peak_memory} MB"
    )
    # Verify that 2 GPU layers uses more memory than 1 GPU layer
    # But not excessively more (should be a reasonable increase)
    assert layerwise_offload_peak_memory < layerwise_offload_two_layers_peak, (
        f"1 GPU layer peak {layerwise_offload_peak_memory} MB should be < "
        f"2 GPU layers peak {layerwise_offload_two_layers_peak} MB"
    )
tests/e2e/offline_inference/test_diffusion_lora.py
0 → 100644
View file @
c1cacde6
import
json
import
os
import
sys
from
pathlib
import
Path
import
pytest
import
torch
from
safetensors.torch
import
save_file
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
from
vllm_omni.outputs
import
OmniRequestOutput
# ruff: noqa: E402
REPO_ROOT
=
Path
(
__file__
).
resolve
().
parents
[
2
]
if
str
(
REPO_ROOT
)
not
in
sys
.
path
:
sys
.
path
.
insert
(
0
,
str
(
REPO_ROOT
))
from
vllm_omni
import
Omni
os
.
environ
[
"VLLM_TEST_CLEAN_GPU_MEMORY"
]
=
"1"
# This test is specific to Z-Image LoRA behavior. Keep it focused on a single
# model to reduce runtime and avoid extra downloads.
models
=
[
"Tongyi-MAI/Z-Image-Turbo"
]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str, tmp_path: Path):
    """Generate with and without a synthetic LoRA adapter and compare.

    First produces a baseline image, then (for Z-Image-Turbo) writes a tiny
    hand-crafted PEFT adapter to disk, regenerates with the same seed, and
    asserts the LoRA changed the output pixels.
    """

    def _extract_images(outputs: list[OmniRequestOutput]):
        # Pull the image list out of the first stage output, validating the
        # nested OmniRequestOutput structure along the way.
        if not outputs:
            raise ValueError("Empty outputs from Omni.generate()")
        first_output = outputs[0]
        assert first_output.final_output_type == "image"
        if not hasattr(first_output, "request_output") or not first_output.request_output:
            raise ValueError("No request_output found in OmniRequestOutput")
        req_out = first_output.request_output[0]
        if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
            raise ValueError("Invalid request_output structure or missing 'images' key")
        return req_out.images

    def _write_zimage_lora(adapter_dir: Path) -> str:
        # Write a minimal rank-1 PEFT adapter (safetensors + config) that
        # perturbs only the Q slice of the first attention layer's QKV.
        adapter_dir.mkdir(parents=True, exist_ok=True)
        # Z-Image transformer uses dim=3840 by default (see ZImageTransformer2DModel).
        dim = 3840
        module_name = "transformer.layers.0.attention.to_qkv"
        rank = 1
        lora_a = torch.zeros((rank, dim), dtype=torch.float32)
        lora_a[0, 0] = 1.0
        # QKVParallelLinear packs (Q, K, V). With tp=1 and n_kv_heads==n_heads in Z-Image,
        # each slice is `dim`, so total out dim is `3 * dim`.
        lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32)
        # Apply a visible delta to the Q slice only to keep the perturbation bounded.
        lora_b[:dim, 0] = 0.1
        save_file(
            {
                f"base_model.model.{module_name}.lora_A.weight": lora_a,
                f"base_model.model.{module_name}.lora_B.weight": lora_b,
            },
            str(adapter_dir / "adapter_model.safetensors"),
        )
        (adapter_dir / "adapter_config.json").write_text(
            json.dumps(
                {
                    "r": rank,
                    "lora_alpha": rank,
                    "target_modules": [module_name],
                }
            ),
            encoding="utf-8",
        )
        return str(adapter_dir)

    m = Omni(model=model_name)
    try:
        # high resolution may cause OOM on L4
        height = 256
        width = 256
        prompt = "a photo of a cat sitting on a laptop keyboard"
        outputs = m.generate(
            prompt,
            OmniDiffusionSamplingParams(
                height=height,
                width=width,
                num_inference_steps=2,
                guidance_scale=0.0,
                # Same seed as the LoRA run below so any pixel difference is
                # attributable to the adapter, not sampling noise.
                generator=torch.Generator("cuda").manual_seed(42),
                num_outputs_per_prompt=1,
            ),
        )
        images = _extract_images(outputs)
        assert len(images) == 1
        # check image size
        assert images[0].width == width
        assert images[0].height == height
        # Real LoRA E2E: generate again with a real on-disk PEFT adapter and
        # verify that output changes.
        if model_name == "Tongyi-MAI/Z-Image-Turbo":
            from vllm_omni.lora.request import LoRARequest
            from vllm_omni.lora.utils import stable_lora_int_id

            lora_dir = _write_zimage_lora(tmp_path / "zimage_lora")
            lora_request = LoRARequest(
                lora_name="test",
                lora_int_id=stable_lora_int_id(lora_dir),
                lora_path=lora_dir,
            )
            outputs_lora = m.generate(
                prompt,
                OmniDiffusionSamplingParams(
                    height=height,
                    width=width,
                    num_inference_steps=2,
                    guidance_scale=0.0,
                    generator=torch.Generator("cuda").manual_seed(42),
                    num_outputs_per_prompt=1,
                    lora_request=lora_request,
                    # lora_scale=2.0 amplifies the adapter's effect so the
                    # mean pixel delta is reliably non-zero.
                    lora_scale=2.0,
                ),
            )
            images_lora = _extract_images(outputs_lora)
            assert len(images_lora) == 1
            assert images_lora[0].width == width
            assert images_lora[0].height == height
            import numpy as np

            # Compare in int16 to avoid uint8 wraparound when subtracting.
            diff = np.abs(
                np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)
            ).mean()
            assert diff > 0.0
    finally:
        m.close()
tests/e2e/offline_inference/test_ovis_image.py
0 → 100644
View file @
c1cacde6
"""
Tests for Ovis Image model pipeline.
Strategy:
1. `mock_dependencies` fixture mocks heavy external components (VAE, Scheduler, TextEncoder)
to allow fast testing of the pipeline logic without downloading weights.
- Mocks are configured to return tensors on the correct device.
- Transformer is mocked dynamically to return random noise of correct shape.
2. `test_real_transformer_init_and_forward` tests the actual `OvisImageTransformer2DModel`
initialization and forward pass with a small configuration to ensure code coverage
and correctness of the model definition itself, independent of the pipeline mocks.
"""
from
unittest.mock
import
MagicMock
,
patch
import
pytest
import
torch
from
vllm_omni.diffusion.data
import
OmniDiffusionConfig
,
TransformerConfig
# Mock the OvisImageTransformer2DModel to avoid complex init if needed,
# or let it run if it's lightweight. It's likely not lightweight.
# Better to mock the transformer forwarding to return random noise.
from
vllm_omni.diffusion.distributed.utils
import
get_local_device
from
vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image
import
OvisImagePipeline
from
vllm_omni.diffusion.request
import
OmniDiffusionRequest
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
@pytest.fixture
def mock_dependencies(monkeypatch):
    """
    Mock external dependencies to avoid loading real models.

    Replaces the tokenizer, text encoder, VAE, and scheduler factory
    functions in the Ovis pipeline module with MagicMocks whose outputs
    live on the local device, so the pipeline can run without weights.
    """
    device = get_local_device()
    # Mock Tokenizer
    mock_tokenizer = MagicMock()
    mock_tokenizer.return_value = MagicMock(
        input_ids=torch.zeros((1, 50), dtype=torch.long, device=device),
        attention_mask=torch.ones((1, 50), dtype=torch.long, device=device),
    )
    mock_tokenizer.apply_chat_template.return_value = "dummy prompt"
    mock_tokenizer.model_max_length = 1024
    # Mock Text Encoder
    mock_text_encoder = MagicMock()
    mock_text_encoder.dtype = torch.float32
    # Output of text encoder must be on the same device as inputs (which are moved to execution_device)
    mock_text_encoder.return_value.last_hidden_state = torch.randn(1, 50, 32, device=device)
    # Mock VAE
    mock_vae = MagicMock()
    mock_vae.config.block_out_channels = [128, 256, 512, 512]  # Scale factor 8
    mock_vae.config.scale_factor_temporal = 1
    mock_vae.config.scale_factor_spatial = 8
    mock_vae.config.scaling_factor = 0.18215
    mock_vae.config.shift_factor = 0.0
    # Decode return value
    mock_vae.decode.return_value = [torch.randn(1, 3, 128, 128, device=device)]
    # Ensure .to() returns self so configuration persists
    mock_vae.to.return_value = mock_vae
    # Mock Scheduler
    mock_scheduler = MagicMock()
    mock_scheduler.config = MagicMock()
    # Timesteps on device to match latents during denoising loop interaction if needed
    mock_scheduler.timesteps = torch.tensor([1.0, 0.5, 0.0], device=device)
    mock_scheduler.set_timesteps.return_value = None

    # Make step return dynamic based on input sample shape
    def mock_scheduler_step(model_output, timestep, sample, **kwargs):
        # sample is the latents, should be preserved
        return (torch.randn_like(sample),)

    mock_scheduler.step.side_effect = mock_scheduler_step
    # Patch the `from_pretrained` factories at the pipeline module's import
    # site so OvisImagePipeline picks up the mocks during construction.
    module_path = "vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image"
    monkeypatch.setattr(f"{module_path}.Qwen2TokenizerFast.from_pretrained", lambda *a, **k: mock_tokenizer)
    monkeypatch.setattr(f"{module_path}.Qwen3Model.from_pretrained", lambda *a, **k: mock_text_encoder)
    monkeypatch.setattr(f"{module_path}.AutoencoderKL.from_pretrained", lambda *a, **k: mock_vae)
    monkeypatch.setattr(f"{module_path}.FlowMatchEulerDiscreteScheduler.from_pretrained", lambda *a, **k: mock_scheduler)
    # Expose the mocks (and device) so tests can make assertions on them.
    return {
        "tokenizer": mock_tokenizer,
        "text_encoder": mock_text_encoder,
        "vae": mock_vae,
        "scheduler": mock_scheduler,
        "device": device,
    }
@pytest.fixture
def ovis_pipeline(mock_dependencies, monkeypatch):
    """
    Creates an OvisImagePipeline instance with mocked components.

    Depends on ``mock_dependencies`` (tokenizer/text-encoder/VAE/scheduler
    mocks) and additionally replaces the transformer class so no real
    weights or GPU kernels are needed.
    """
    # Create config
    tf_config = TransformerConfig(
        params={
            "in_channels": 4,
            "out_channels": 4,
            "sample_size": 32,
            "patch_size": 2,
            "num_attention_heads": 4,
            "attention_head_dim": 8,
            "num_layers": 1,
            "caption_channels": 32,
        }
    )
    od_config = OmniDiffusionConfig(
        model="dummy-ovis",
        tf_model_config=tf_config,
        dtype=torch.float32,
        num_gpus=1,
    )
    # Mock Transformer Layer separately to avoid full init
    # We patch OvisImageTransformer2DModel class in the module
    mock_transformer_cls = MagicMock()
    mock_transformer_instance = MagicMock()
    mock_transformer_instance.dtype = torch.float32
    mock_transformer_instance.in_channels = 16  # Must be 16 so num_channel_latents=4, packed=16

    # Forward return: noise prediction
    def mock_forward(hidden_states, *args, **kwargs):
        # hidden_states shape: (B, SeqLen, Channels)
        return (torch.randn_like(hidden_states),)

    mock_transformer_instance.forward.side_effect = mock_forward
    # Also make the instance itself callable to mimic __call__
    mock_transformer_instance.side_effect = mock_forward
    mock_transformer_cls.return_value = mock_transformer_instance
    monkeypatch.setattr(
        "vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image.OvisImageTransformer2DModel",
        mock_transformer_cls,
    )
    # Initialize pipeline
    # We use a dummy model path check override
    with patch("os.path.exists", return_value=True):
        pipeline = OvisImagePipeline(od_config=od_config)
    return pipeline
def
test_interface_compliance
(
ovis_pipeline
):
"""Verify methods required by vllm-omni framework."""
assert
hasattr
(
ovis_pipeline
,
"load_weights"
)
assert
hasattr
(
ovis_pipeline
,
"scheduler"
)
assert
hasattr
(
ovis_pipeline
,
"transformer"
)
assert
hasattr
(
ovis_pipeline
,
"text_encoder"
)
# assert hasattr(ovis_pipeline, "vae") # Ovis uses VAE
def test_basic_generation(ovis_pipeline):
    """Exercise the mocked pipeline end to end and check the decoded output."""
    request = OmniDiffusionRequest(
        prompts=["A photo of a cat"],
        sampling_params=OmniDiffusionSamplingParams(
            height=256,
            width=256,
            num_inference_steps=2,
            guidance_scale=1.0,
        ),
    )
    result = ovis_pipeline(request)
    assert result is not None
    assert result.output is not None
    # The mocked VAE decode yields a single (1, 3, 128, 128) tensor.
    assert isinstance(result.output, torch.Tensor)
    assert result.output.shape == (1, 3, 128, 128)
    # The (mocked) transformer must have been invoked during denoising.
    assert ovis_pipeline.transformer.call_count > 0
def test_guidance_scale(ovis_pipeline):
    """A guidance scale above 1.0 must drive the classifier-free-guidance path."""
    request = OmniDiffusionRequest(
        prompts=[
            {
                "prompt": "A photo of a cat",
                "negative_prompt": "bad quality",
            }
        ],
        sampling_params=OmniDiffusionSamplingParams(
            height=256,
            width=256,
            num_inference_steps=1,
            guidance_scale=2.0,  # > 1.0 triggers CFG
        ),
    )
    ovis_pipeline(request)
    # CFG implies at least conditional + unconditional transformer calls.
    assert ovis_pipeline.transformer.call_count >= 2
def test_resolution_check(ovis_pipeline):
    """A resolution not divisible by the patch grid should still produce output."""
    request = OmniDiffusionRequest(
        prompts=["test"],
        sampling_params=OmniDiffusionSamplingParams(
            height=250,  # not divisible by 16 (vae scale 8 * patch size 2)
            width=250,
        ),
    )
    # The pipeline is expected to log a warning and proceed rather than raise.
    result = ovis_pipeline(request)
    assert result is not None
def test_real_transformer_init_and_forward():
    """Test the real OvisImageTransformer2DModel initialization and forward pass for coverage.

    Builds a tiny transformer, mocks the tensor-parallel group so
    QKVParallelLinear can initialize without a distributed context, and checks
    the forward pass returns a tuple whose first element matches the input
    hidden-state shape.
    """
    from unittest.mock import patch

    from vllm_omni.diffusion.models.ovis_image.ovis_image_transformer import OvisImageTransformer2DModel

    device = get_local_device()
    tf_config = TransformerConfig(
        params={
            "patch_size": 2,
            "in_channels": 16,
            "out_channels": 16,
            "num_layers": 1,
            "num_single_layers": 1,
            "attention_head_dim": 8,
            "num_attention_heads": 2,
            "joint_attention_dim": 32,
            "axes_dims_rope": (4, 4, 4),
        }
    )
    od_config = OmniDiffusionConfig(model="dummy-ovis", tf_model_config=tf_config, dtype=torch.bfloat16, num_gpus=1)
    # Fix: remember and restore the global default dtype so this test does
    # not leak bfloat16 into every later test in the same process.
    prev_default_dtype = torch.get_default_dtype()
    torch.set_default_dtype(torch.bfloat16)
    try:
        # Mock distributed state for QKVParallelLinear initialization
        # We patch get_tp_group because get_tensor_model_parallel_rank calls it and asserts _TP is not None
        mock_group = MagicMock()
        mock_group.rank_in_group = 0
        mock_group.world_size = 1
        with patch("vllm.distributed.parallel_state.get_tp_group", return_value=mock_group):
            # Initialize real model
            model = OvisImageTransformer2DModel(
                od_config=od_config,
                patch_size=1,
                in_channels=16,
                out_channels=16,
                num_single_layers=1,
                attention_head_dim=8,
                num_attention_heads=2,
                joint_attention_dim=32,
                axes_dims_rope=(2, 2, 4),
            ).to(device)
            # Create dummy inputs
            B, Seq, C = 1, 16, 16
            hidden_states = torch.randn(B, Seq, C, device=device)
            encoder_hidden_states = torch.randn(B, 10, 32, device=device)  # joint_attention_dim=32
            timestep = torch.tensor([1], device=device)
            img_ids = torch.zeros(Seq, 3, device=device)
            txt_ids = torch.zeros(10, 3, device=device)
            # Run forward
            output = model(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                timestep=timestep,
                img_ids=img_ids,
                txt_ids=txt_ids,
                return_dict=False,
            )
        assert output is not None
        assert isinstance(output, tuple)
        assert output[0].shape == hidden_states.shape
    finally:
        torch.set_default_dtype(prev_default_dtype)
tests/e2e/offline_inference/test_qwen2_5_omni.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E tests for Qwen2.5-Omni model with mixed modality inputs and audio output.
"""
from
pathlib
import
Path
import
pytest
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.multimodal.image
import
convert_image_mode
from
vllm_omni.platforms
import
current_omni_platform
from
.conftest
import
OmniRunner
from
.utils
import
create_new_process_for_each_test
# Models exercised by this E2E suite.
models = ["Qwen/Qwen2.5-Omni-3B"]
# CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU
if current_omni_platform.is_npu():
    stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml")
elif current_omni_platform.is_rocm():
    # ROCm stage config optimized for MI325 GPU
    stage_config = str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")
else:
    # Default CUDA stage config.
    stage_config = str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")
# Create parameter combinations for model and stage config
test_params = [(model, stage_config) for model in models]
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
    """Test processing audio, image, and video together, generating audio output."""
    model, stage_config_path = test_config
    with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
        # Build one prompt that references all three modalities.
        question = "What is recited in the audio? What is in this image? Describe the video briefly."
        waveform, rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
        audio = (waveform[:16000 * 5], rate)  # Trim to first 5 seconds
        image = convert_image_mode(
            ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB"
        )
        if VLLM_USE_MODELSCOPE:
            # modelscope can't access raushan-testing-hf/videos-test, skip video input temporarily
            video = None
        else:
            video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays

        outputs = runner.generate_multimodal(
            prompts=question,
            audios=audio,
            images=image,
            videos=video,
        )

        # Thinker stage must produce a non-empty text output.
        text_output = next(
            (stage for stage in outputs if stage.final_output_type == "text"), None
        )
        assert text_output is not None
        assert len(text_output.request_output) > 0
        text_content = text_output.request_output[0].outputs[0].text
        assert text_content is not None
        assert len(text_content.strip()) > 0

        # Code2wav stage must produce a non-empty audio tensor.
        audio_output = next(
            (stage for stage in outputs if stage.final_output_type == "audio"), None
        )
        assert audio_output is not None
        assert len(audio_output.request_output) > 0
        audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
        assert audio_tensor is not None
        assert audio_tensor.numel() > 0
@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
    """Test processing audio, image, and video together, generating text output only.

    With modalities=["text"] requested, no stage may emit an audio output and
    the thinker stage must still produce non-empty text.
    """
    model, stage_config_path = test_config
    with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
        # Prepare multimodal inputs
        question = "What is recited in the audio? What is in this image? Describe the video briefly."
        audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
        audio = (audio[0][:16000 * 5], audio[1])  # Trim to first 5 seconds
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image.resize((128, 128)), "RGB")
        video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
        modalities = ["text"]
        outputs = runner.generate_multimodal(
            prompts=question,
            audios=audio,
            images=image,
            videos=video,
            modalities=modalities,
        )
        # Fix: scan EVERY stage output for stray audio. The previous loop broke
        # out at the first text output, so audio emitted by a later stage would
        # have gone unchecked.
        text_output = None
        for stage_output in outputs:
            assert stage_output.final_output_type != "audio"
            if text_output is None and stage_output.final_output_type == "text":
                text_output = stage_output
        # Verify the thinker stage produced non-empty text.
        assert text_output is not None
        assert len(text_output.request_output) > 0
        text_content = text_output.request_output[0].outputs[0].text
        assert text_content is not None
        assert len(text_content.strip()) > 0
tests/e2e/offline_inference/test_qwen3_omni.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E offline tests for Omni model with video input and audio output.
"""
import os

# Set vLLM configuration through the environment BEFORE importing anything from
# vllm / vllm_omni, so the values are visible at import time.
# NOTE(review): "spawn" presumably avoids forking an accelerator-initialized
# process; GPU-memory cleanup is disabled ("0") for this module — confirm both
# against CI constraints.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"

from pathlib import Path

import pytest
from vllm.assets.video import VideoAsset

from vllm_omni.platforms import current_omni_platform

from .conftest import OmniRunner
models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]

# CI stage config for 2xH100-80G GPUs or AMD GPU MI325.
_cfg_dir = Path(__file__).parent / "stage_configs"
if current_omni_platform.is_rocm():
    # ROCm stage config optimized for MI325 GPU
    stage_configs = [str(_cfg_dir / "rocm" / "qwen3_omni_ci.yaml")]
else:
    stage_configs = [str(_cfg_dir / "qwen3_omni_ci.yaml")]

# Create parameter combinations for model and stage config
test_params = [(m, cfg) for m in models for cfg in stage_configs]
@pytest.mark.parametrize("test_config", test_params)
def test_video_to_audio(omni_runner: type[OmniRunner], test_config) -> None:
    """Test processing video, generating audio output."""
    model, stage_config_path = test_config
    with omni_runner(
        model,
        seed=42,
        stage_configs_path=stage_config_path,
        stage_init_timeout=300,
    ) as runner:
        # Prepare inputs
        question = "Describe the video briefly."
        video = VideoAsset(name="baby_reading", num_frames=4).np_ndarrays
        outputs = runner.generate_multimodal(
            prompts=question,
            videos=video,
        )

        def _first_stage_of(kind):
            # Return the first stage output of the requested modality, or None.
            for stage_output in outputs:
                if stage_output.final_output_type == kind:
                    return stage_output
            return None

        # Thinker stage: must produce non-empty text.
        text_output = _first_stage_of("text")
        assert text_output is not None
        assert len(text_output.request_output) > 0
        text_content = text_output.request_output[0].outputs[0].text
        assert text_content is not None
        assert len(text_content.strip()) > 0

        # Code2wav stage: must produce a non-empty audio tensor.
        audio_output = _first_stage_of("audio")
        assert audio_output is not None
        assert len(audio_output.request_output) > 0
        audio_tensor = audio_output.request_output[0].outputs[0].multimodal_output["audio"]
        assert audio_tensor is not None
        assert audio_tensor.numel() > 0
tests/e2e/offline_inference/test_sequence_parallel.py
0 → 100644
View file @
c1cacde6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
System test for Sequence Parallel (SP) backends: Ulysses and Ring attention.
Tests verify that SP inference produces correct outputs compared to baseline.
"""
import
gc
import
os
import
sys
import
time
from
pathlib
import
Path
from
typing
import
NamedTuple
import
numpy
as
np
import
pytest
import
torch
import
torch.distributed
as
dist
from
PIL
import
Image
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
# ruff: noqa: E402
# Make the repository root importable when running from a source checkout.
REPO_ROOT = Path(__file__).resolve().parents[3]
_root = str(REPO_ROOT)
if _root not in sys.path:
    sys.path.insert(0, _root)
from
vllm_omni
import
Omni
from
vllm_omni.diffusion.data
import
DiffusionParallelConfig
from
vllm_omni.platforms
import
current_omni_platform
# Test configuration
MODELS = ["riverclouds/qwen_image_random"]  # presumably small random-weight checkpoint for CI — confirm
PROMPT = "a photo of a cat sitting on a laptop keyboard"
DEFAULT_HEIGHT = 256  # output resolution in pixels
DEFAULT_WIDTH = 256
DEFAULT_SEED = 42  # RNG seed for reproducible sampling
DEFAULT_STEPS = 4  # diffusion steps per run (kept low for test speed)
# Tolerances for per-pixel deviation (values in [0, 1]) between SP and baseline.
DIFF_MEAN_THRESHOLD = 2e-2
DIFF_MAX_THRESHOLD = 2e-1
class InferenceResult(NamedTuple):
    """Result of an inference run."""

    # Images produced by the timed generate() call.
    images: list[Image.Image]
    # Wall-clock latency of the timed generate() call, in milliseconds.
    elapsed_ms: float
def _cleanup_distributed():
    """Clean up distributed environment and GPU resources."""
    if dist.is_initialized():
        dist.destroy_process_group()
    # Scrub the rank-related environment so the next run starts from scratch.
    rank_env_vars = ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK")
    for env_var in rank_env_vars:
        os.environ.pop(env_var, None)
    gc.collect()
    if current_omni_platform.is_available():
        current_omni_platform.empty_cache()
        current_omni_platform.synchronize()
    # Brief pause so resources are actually released before the next run.
    time.sleep(5)
def _diff_metrics(a: Image.Image, b: Image.Image) -> tuple[float, float]:
    """Return (mean_abs_diff, max_abs_diff) over RGB pixels in [0, 1]."""

    def _to_unit_tensor(img):
        # Normalize to float32 RGB in [0, 1] before comparing.
        return torch.from_numpy(np.asarray(img.convert("RGB"), dtype=np.float32) / 255.0)

    ta = _to_unit_tensor(a)
    tb = _to_unit_tensor(b)
    assert ta.shape == tb.shape, f"Image shapes differ: {ta.shape} vs {tb.shape}"
    delta = torch.abs(ta - tb)
    return delta.mean().item(), delta.max().item()
def _run_inference(
    model_name: str,
    dtype: torch.dtype,
    attn_backend: str,
    ulysses_degree: int = 1,
    ring_degree: int = 1,
    height: int = DEFAULT_HEIGHT,
    width: int = DEFAULT_WIDTH,
    seed: int = DEFAULT_SEED,
    warmup: bool = True,
) -> InferenceResult:
    """Run inference with specified configuration.

    Args:
        model_name: Model identifier to load.
        dtype: Torch dtype for the run.
        attn_backend: Attention backend name (e.g. "sdpa").
        ulysses_degree: Ulysses sequence-parallel degree (1 = disabled).
        ring_degree: Ring-attention sequence-parallel degree (1 = disabled).
        height: Output image height in pixels.
        width: Output image width in pixels.
        seed: Generator seed for the timed run; the warmup run uses seed + 1000
            so it cannot share the timed run's sampling stream.
        warmup: If True, run one warmup iteration before the timed run.

    Returns:
        InferenceResult with the generated images and the timed-run latency (ms).
    """

    def _sampling_params(seed_value: int) -> OmniDiffusionSamplingParams:
        # Single source of truth for the sampling parameters: warmup and timed
        # runs previously duplicated this whole constructor, differing only in
        # the seed.
        return OmniDiffusionSamplingParams(
            height=height,
            width=width,
            num_inference_steps=DEFAULT_STEPS,
            guidance_scale=0.0,
            generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed_value),
            num_outputs_per_prompt=1,
        )

    parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree)
    omni = Omni(
        model=model_name,
        parallel_config=parallel_config,
        dtype=dtype,
        attention_backend=attn_backend,
    )
    try:
        # Warmup run (not timed)
        if warmup:
            _ = omni.generate(PROMPT, _sampling_params(seed + 1000))
        # Timed run
        start = time.time()
        outputs = omni.generate(PROMPT, _sampling_params(seed))
        elapsed_ms = (time.time() - start) * 1000
        return InferenceResult(
            images=outputs[0].request_output[0].images,
            elapsed_ms=elapsed_ms,
        )
    finally:
        # Always release the engine and distributed state, even on failure.
        omni.close()
        _cleanup_distributed()
# =============================================================================
# Correctness & Performance Tests
# =============================================================================
# SP configurations: (ulysses_degree, ring_degree, height, width, warmup, is_perf_test)
# - warmup: whether to run warmup for this SP config
# - is_perf_test: whether this is a performance test (show speedup metrics)
SP_CONFIGS = [
    (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ulysses-2 - performance test
    (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ring-2 - performance test
    (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False),  # Hybrid - correctness only
    # 272x272 is a deliberately non-default size — presumably to exercise
    # non-even sequence splits under Ulysses-4; confirm intent.
    (4, 1, 272, 272, False, False),  # Ulysses-4 - shape and correctness
]
def
_get_sp_mode
(
ulysses_degree
:
int
,
ring_degree
:
int
)
->
str
:
"""Get SP mode name for logging."""
if
ulysses_degree
>
1
and
ring_degree
==
1
:
return
f
"ulysses-
{
ulysses_degree
}
"
elif
ring_degree
>
1
and
ulysses_degree
==
1
:
return
f
"ring-
{
ring_degree
}
"
else
:
return
f
"hybrid-
{
ulysses_degree
}
x
{
ring_degree
}
"
@pytest.mark.parametrize("model_name", MODELS)
def test_sp_correctness(model_name: str):
    """Test that SP inference produces correct outputs and measure performance.

    Runs baseline once per unique (height, width), then tests all SP configs.
    Note: Run with `pytest -v -s` to see detailed output.
    """
    device_count = current_omni_platform.get_device_count()
    # Cache baseline results by (height, width)
    # Key: (height, width), Value: (result, warmup_used)
    baseline_cache: dict[tuple[int, int], InferenceResult] = {}
    # Collect results for summary
    results: list[dict] = []
    print("\n" + "=" * 70)
    print(f"Sequence Parallel Test - Model: {model_name}")
    print(f"Available GPUs: {device_count}")
    print("=" * 70)
    for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS:
        sp_size = ulysses_degree * ring_degree
        sp_mode = _get_sp_mode(ulysses_degree, ring_degree)
        # Skip (not fail) configs that need more devices than are present.
        if device_count < sp_size:
            print(f"\n[{sp_mode}] SKIPPED (requires {sp_size} GPUs)")
            continue
        # Determine baseline warmup: only for default size (performance tests)
        cache_key = (height, width)
        baseline_warmup = height == DEFAULT_HEIGHT and width == DEFAULT_WIDTH
        # Get or compute baseline for this (height, width)
        if cache_key not in baseline_cache:
            print(f"\n--- Running baseline {height}x{width} (warmup={baseline_warmup}) ---")
            baseline = _run_inference(
                model_name,
                torch.bfloat16,
                "sdpa",
                height=height,
                width=width,
                warmup=baseline_warmup,
            )
            assert len(baseline.images) == 1
            baseline_cache[cache_key] = baseline
            print(f"[baseline] {height}x{width}: {baseline.elapsed_ms:.0f}ms")
        else:
            baseline = baseline_cache[cache_key]
        # Run SP
        print(f"\n--- Running {sp_mode} (warmup={sp_warmup}) ---")
        sp_result = _run_inference(
            model_name,
            torch.bfloat16,
            "sdpa",
            ulysses_degree=ulysses_degree,
            ring_degree=ring_degree,
            height=height,
            width=width,
            warmup=sp_warmup,
        )
        assert len(sp_result.images) == 1
        # Compare outputs (correctness)
        mean_diff, max_diff = _diff_metrics(baseline.images[0], sp_result.images[0])
        # Build result entry
        result = {
            "mode": sp_mode,
            "sp_size": sp_size,
            "height": height,
            "width": width,
            "baseline_ms": baseline.elapsed_ms,
            "sp_ms": sp_result.elapsed_ms,
            "mean_diff": mean_diff,
            "max_diff": max_diff,
            "is_perf_test": is_perf_test,
        }
        results.append(result)
        # Output based on test type
        if is_perf_test:
            # Guard against a zero elapsed time to avoid ZeroDivisionError.
            speedup = baseline.elapsed_ms / sp_result.elapsed_ms if sp_result.elapsed_ms > 0 else 0
            result["speedup"] = speedup
            print(
                f"[{sp_mode}] {sp_size} GPUs | "
                f"baseline: {baseline.elapsed_ms:.0f}ms, sp: {sp_result.elapsed_ms:.0f}ms, "
                f"speedup: {speedup:.2f}x"
            )
        else:
            print(f"[{sp_mode}] {sp_size} GPUs | sp: {sp_result.elapsed_ms:.0f}ms (correctness only)")
        print(f"[{sp_mode}] diff: mean={mean_diff:.6e}, max={max_diff:.6e}")
        # Assert correctness
        assert mean_diff <= DIFF_MEAN_THRESHOLD and max_diff <= DIFF_MAX_THRESHOLD, (
            f"[{sp_mode}] SP output differs from baseline: mean={mean_diff:.6e}, max={max_diff:.6e}"
        )
    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"{'Mode':<15} {'GPUs':<6} {'Size':<10} {'Baseline':<12} {'SP':<12} {'Speedup':<10} {'Status'}")
    print("-" * 70)
    for r in results:
        # "speedup" is only set for perf-test entries, hence .get().
        speedup_str = f"{r['speedup']:.2f}x" if r.get("speedup") else "N/A"
        baseline_str = f"{r['baseline_ms']:.0f}ms" if r["is_perf_test"] else "N/A"
        status = "PASS" if r["mean_diff"] <= DIFF_MEAN_THRESHOLD else "FAIL"
        print(
            f"{r['mode']:<15} {r['sp_size']:<6} {r['height']}x{r['width']:<5} "
            f"{baseline_str:<12} {r['sp_ms']:.0f}ms {'':<7} {speedup_str:<10} {status}"
        )
    print("=" * 70)
tests/e2e/offline_inference/test_stable_audio_model.py
0 → 100644
View file @
c1cacde6
import
sys
from
pathlib
import
Path
import
numpy
as
np
import
pytest
import
torch
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
from
vllm_omni.outputs
import
OmniRequestOutput
# ruff: noqa: E402
# Fix: from tests/e2e/offline_inference/<file>, the repository root is
# parents[3]; parents[2] resolved to tests/, which is not importable as the
# package root. This matches sibling test_sequence_parallel.py.
REPO_ROOT = Path(__file__).resolve().parents[3]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
from
vllm_omni
import
Omni
# Use random weights model for CI testing (small, no authentication required)
models = ["linyueqian/stable_audio_random"]
@pytest.mark.parametrize("model_name", models)
def test_stable_audio_model(model_name: str):
    """Generate a short stereo clip and validate the returned audio shape."""
    m = Omni(model=model_name)
    # Use minimal settings for testing
    # Generate a short 2-second audio clip with minimal inference steps
    audio_start_in_s = 0.0
    audio_end_in_s = 2.0  # Short duration for fast testing
    sample_rate = 44100  # Stable Audio uses 44100 Hz
    try:
        outputs = m.generate(
            prompts={
                "prompt": "The sound of a dog barking",
                "negative_prompt": "Low quality.",
            },
            sampling_params_list=OmniDiffusionSamplingParams(
                num_inference_steps=4,  # Minimal steps for speed
                guidance_scale=7.0,
                generator=torch.Generator("cuda").manual_seed(42),
                num_outputs_per_prompt=1,
                extra_args={
                    "audio_start_in_s": audio_start_in_s,
                    "audio_end_in_s": audio_end_in_s,
                },
            ),
        )
        # Extract audio from OmniRequestOutput
        assert outputs is not None
        first_output = outputs[0]
        assert first_output.final_output_type == "image"
        assert hasattr(first_output, "request_output") and first_output.request_output
        req_out = first_output.request_output[0]
        assert isinstance(req_out, OmniRequestOutput)
        assert req_out.final_output_type == "audio"
        assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output
        audio = req_out.multimodal_output.get("audio")
        assert isinstance(audio, np.ndarray)
        # audio shape: (batch, channels, samples)
        # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples
        assert audio.ndim == 3
        assert audio.shape[0] == 1  # batch size
        assert audio.shape[1] == 2  # stereo channels
        expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate)
        assert audio.shape[2] == expected_samples  # 88200 samples for 2 seconds
    finally:
        # Fix: the engine was never closed, leaking GPU resources when an
        # assertion failed; sibling test_t2i_model.py closes in a finally block.
        if hasattr(m, "close"):
            m.close()
tests/e2e/offline_inference/test_t2i_model.py
0 → 100644
View file @
c1cacde6
import
os
import
sys
from
pathlib
import
Path
import
pytest
import
torch
from
vllm_omni.inputs.data
import
OmniDiffusionSamplingParams
from
vllm_omni.outputs
import
OmniRequestOutput
from
vllm_omni.platforms
import
current_omni_platform
# ruff: noqa: E402
# Fix: from tests/e2e/offline_inference/<file>, the repository root is
# parents[3]; parents[2] resolved to tests/, which is not importable as the
# package root. This matches sibling test_sequence_parallel.py.
REPO_ROOT = Path(__file__).resolve().parents[3]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
from
vllm_omni
import
Omni
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"

# Default model set for CUDA; platform-specific branches below override it.
models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"]
# Modelscope can't find riverclouds/qwen_image_random
# TODO: When NPU support is ready, remove this branch.
if current_omni_platform.is_npu():
    models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"]
elif current_omni_platform.is_rocm():
    # TODO: When ROCm support is ready, remove this branch.
    # vLLM V0.11.0 has issues running riverclouds/qwen_image_random
    # on ROCm
    models = ["Tongyi-MAI/Z-Image-Turbo"]
@pytest.mark.parametrize("model_name", models)
def test_diffusion_model(model_name: str):
    """Generate two images and verify count, dimensions, and saving."""
    m = None
    try:
        m = Omni(model=model_name)
        # high resolution may cause OOM on L4
        height, width = 256, 256
        sampling = OmniDiffusionSamplingParams(
            height=height,
            width=width,
            num_inference_steps=2,
            guidance_scale=0.0,
            generator=torch.Generator("cuda").manual_seed(42),
            num_outputs_per_prompt=2,
        )
        outputs = m.generate(
            "a photo of a cat sitting on a laptop keyboard",
            sampling,
        )
        # Extract images from request_output[0]['images']
        first_output = outputs[0]
        assert first_output.final_output_type == "image"
        has_request_output = hasattr(first_output, "request_output") and first_output.request_output
        if not has_request_output:
            raise ValueError("No request_output found in OmniRequestOutput")
        req_out = first_output.request_output[0]
        if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"):
            raise ValueError("Invalid request_output structure or missing 'images' key")
        images = req_out.images
        assert len(images) == 2
        # check image size
        assert images[0].width == width
        assert images[0].height == height
        images[0].save("image_output.png")
    except Exception as e:
        print(f"Test failed with error: {e}")
        raise
    finally:
        if m is not None and hasattr(m, "close"):
            m.close()
Prev
1
…
7
8
9
10
11
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment