[Misc] Align Qwen3-VL-embedding image example outputs with HF repo example (#33419)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

[Misc] Align Qwen3-VL-embedding image example outputs with HF repo example (#33419)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
9df152bb · Isotr0py · GitHub · 876a16f4 · 9df152bb
Unverified Commit 9df152bb authored Jan 31, 2026 by Isotr0py Committed by GitHub Jan 30, 2026
Show whitespace changes
Inline Side-by-side

Showing with 28 additions and 1 deletion

examples/pooling/embed/vision_embedding_offline.py examples/pooling/embed/vision_embedding_offline.py +28 -1

No files found.
--- a/examples/pooling/embed/vision_embedding_offline.py
+++ b/examples/pooling/embed/vision_embedding_offline.py
@@ -12,6 +12,8 @@ on HuggingFace model repository.
 import argparse
 from dataclasses import asdict
+from PIL.Image import Image
 from vllm import LLM, EngineArgs
 from vllm.multimodal.utils import fetch_image
@@ -20,17 +22,42 @@ text = "A cat standing in the snow."
 multi_modal_data = {"image": fetch_image(image_url)}
-def print_embeddings(embeds):
+def print_embeddings(embeds: list[float]):
    embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
    print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
 def run_qwen3_vl():
+    try:
+        from qwen_vl_utils import smart_resize
+    except ModuleNotFoundError:
+        print(
+            "WARNING: `qwen-vl-utils` not installed, input images will not "
+            "be automatically resized. This can cause different results "
+            "comparing with HF repo's example. "
+            "You can enable this functionality by `pip install qwen-vl-utils`."
+        )
+        smart_resize = None
+    if smart_resize is not None:
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=32,
+            )
+            return image.resize((resized_width, resized_height))
+        multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Embedding-2B",
        runner="pooling",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
+        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
    )
    default_instruction = "Represent the user's input."
    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"