fix: restrict dummy embedding value range for bypassing vLLM check in E/P/D (#7117)

Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>

fix: restrict dummy embedding value range for bypassing vLLM check in E/P/D (#7117)
Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com>
2cc92bfa · GuanLuo · GitHub · a274ef82 · 2cc92bfa · 2cc92bfa
Unverified Commit 2cc92bfa authored Mar 09, 2026 by GuanLuo Committed by GitHub Mar 09, 2026
2 changed files
--- a/components/src/dynamo/vllm/multimodal_utils/model.py
+++ b/components/src/dynamo/vllm/multimodal_utils/model.py
@@ -272,7 +272,15 @@ def construct_qwen_decode_mm_data(
    # This prevents prefix cache from incorrectly matching different images
    # that happen to have the same dimensions (same image_grid_thw).
    # bit ops to convert request ID to somewhat unique value that fits in the dtype range
-    fill_value = hash(request_id) & ((1 << (dtype.itemsize * 8)) - 1)
+    if not hasattr(construct_qwen_decode_mm_data, "_counter"):
+        construct_qwen_decode_mm_data._counter = 0
+    fill_value = construct_qwen_decode_mm_data._counter
+    construct_qwen_decode_mm_data._counter += 1
+    max_val = (
+        torch.finfo(dtype).max if dtype.is_floating_point else torch.iinfo(dtype).max
+    )
+    if construct_qwen_decode_mm_data._counter > max_val:
+        construct_qwen_decode_mm_data._counter = 0
    image_embeds = torch.full(
        embeddings_shape, fill_value=fill_value, dtype=dtype, device="cpu"
    )

--- a/components/src/dynamo/vllm/tests/multimodal_utils/test_vllm_model.py
+++ b/components/src/dynamo/vllm/tests/multimodal_utils/test_vllm_model.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for construct_qwen_decode_mm_data."""
+import pytest
+import torch
+from dynamo.vllm.multimodal_utils.model import construct_qwen_decode_mm_data
+pytestmark = [
+    pytest.mark.pre_merge,
+    pytest.mark.vllm,
+    pytest.mark.gpu_0,
+    pytest.mark.multimodal,
+]
+class TestMultiModalUtils:
+    """Tests for the multimodal helper construct_qwen_decode_mm_data."""
+
+    def test_construct_qwen_decode_mm_data(self):
+        """Call the helper repeatedly and check the structure of each result.
+
+        Every call must complete without raising, and the returned mapping
+        must contain an "image" entry with the expected "image_grid_thw"
+        tensor and an "image_embeds" tensor of the requested shape.
+        """
+        # Loop two rounds beyond the largest float16 value.
+        # NOTE(review): presumably this drives the helper's internal fill-value
+        # counter past its wraparound point -- confirm the dtype the helper
+        # uses when none is passed.
+        max_rounds = int(torch.finfo(torch.float16).max) + 2
+        expected_image_grid_thw_tensor = torch.tensor([16, 16])
+        for i in range(max_rounds):
+            # Should not raise any exception
+            try:
+                mm_data = construct_qwen_decode_mm_data(
+                    image_grid_thw=[16, 16],
+                    embeddings_shape=[2, 1024],
+                    request_id=str(i),
+                )
+            except Exception as e:
+                pytest.fail(
+                    f"construct_qwen_decode_mm_data raised {type(e).__name__} on round {i}: {e}"
+                )
+            assert "image" in mm_data
+            assert "image_grid_thw" in mm_data["image"]
+            assert "image_embeds" in mm_data["image"]
+            assert torch.allclose(
+                mm_data["image"]["image_grid_thw"], expected_image_grid_thw_tensor
+            )
+            # Embedding values are placeholder fill values, so only the shape is checked.
+            assert mm_data["image"]["image_embeds"].shape == (2, 1024)