Merge remote-tracking branch 'origin/v0.11.0-dev' into v0.11.0-dev

0da696a7 · 王敏 · 82c0bf76 · 6fa116fb · 0da696a7 · 0da696a7
Commit 0da696a7 authored Jan 20, 2026 by 王敏
20 changed files
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -352,13 +352,19 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
 pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

+You must enable this feature via `enable_mm_embeds=True`.
+
+!!! warning
+    The vLLM engine may crash if embeddings with an incorrect shape are passed.
+    Only enable this flag for trusted users!
+
 ??? code

    ```python
    from vllm import LLM

    # Inference with image embeddings as input
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)

    # Refer to the HuggingFace repo for the correct format to use
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
@@ -390,7 +396,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
    image_embeds = torch.load(...)

    # Qwen2-VL
-    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
@@ -400,7 +410,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
    }

    # MiniCPM-V
-    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "openbmb/MiniCPM-V-2_6",
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
    mm_data = {
        "image": {
            "image_embeds": image_embeds,
@@ -720,7 +735,13 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
 ### Embedding Inputs

 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape to the corresponding field of the multi-modal dictionary.
+pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+
+You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
+
+!!! warning
+    The vLLM engine may crash if embeddings with an incorrect shape are passed.
+    Only enable this flag for trusted users!

 #### Image Embedding Inputs


--- a/docs/features/prompt_embeds.md
+++ b/docs/features/prompt_embeds.md
@@ -20,12 +20,16 @@ You can pass prompt embeddings from Hugging Face Transformers models to the  `'p

 ## Online Serving

-Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
+Our OpenAI-compatible server accepts prompt embedding inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embedding inputs are passed via a new `'prompt_embeds'` key in the JSON payload and must be enabled with the `--enable-prompt-embeds` flag in `vllm serve`.

 When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.

 Prompt embeddings are passed in as base64 encoded torch tensors.

+!!! warning
+    The vLLM engine may crash if embeddings with an incorrect shape are passed.
+    Only enable this flag for trusted users!
+
 ### Transformers Inputs via OpenAI Client

 First, launch the OpenAI-compatible server:

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -50,6 +50,7 @@ class PrithviMAE:
            dtype="float16",
            enforce_eager=True,
            model_impl="terratorch",
+            enable_mm_embeds=True,
        )

    def run(self, input_data, location_coords):

--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -38,6 +38,7 @@ def main():
        max_num_seqs=32,
        io_processor_plugin="prithvi_to_tiff",
        model_impl="terratorch",
+        enable_mm_embeds=True,
    )

    pooling_params = PoolingParams(task="encode", softmax=False)

--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@@ -19,6 +19,7 @@ import requests
 #   --task embed --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
 #   --io-processor-plugin prithvi_to_tiff
+#   --enable-mm-embeds


 def main():

--- a/setup.py
+++ b/setup.py
@@ -509,9 +509,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
        if sha != 'Unknown':
            if sha is None:
                sha = get_sha(vllm_root)
-            version = 'das.opt1.beta.' + sha[:7]
+            version = 'das.opt1.rc1.' + sha[:7]
    else:
-        version = 'das.opt1.beta'
+        version = 'das.opt1.rc1'


    # dtk version

--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+import torch
 import os

 from vllm import LLM
@@ -14,8 +15,22 @@ def test_empty_prompt():
        llm.generate([""])


-@pytest.mark.skip_v1
 def test_out_of_vocab_token():
    llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate({"prompt_token_ids": [999999]}) 
+
+
+def test_require_mm_embeds():
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        enforce_eager=True,
+        enable_mm_embeds=False,
+    )
+    with pytest.raises(ValueError, match="--enable-mm-embeds"):
+        llm.generate(
+            {
+                "prompt": "<image>",
+                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
+            }
+        )
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -263,3 +263,16 @@ async def test_prompt_logprobs_raises_error(
                "prompt_logprobs": True
            },
        )
+        
+
+@pytest.mark.asyncio
+async def test_empty_prompt_embeds(
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+) -> None:
+    await client_with_prompt_embeds.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body={"prompt_embeds": []},
+    )
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
+from unittest.mock import Mock
 import os
 # imports for structured outputs tests
 import openai
@@ -10,7 +11,8 @@ import pytest
 import regex as re
 import torch

-from vllm.entrypoints.renderer import BaseRenderer
+from vllm.config import ModelConfig
+from vllm.entrypoints.renderer import CompletionRenderer

 from ...utils import RemoteOpenAIServer, models_path_prefix

@@ -63,6 +65,9 @@ async def test_out_of_vocab_token_ids():
 @pytest.mark.parametrize("hidden_size", [2, 10])
 def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
                            seq_len: int, hidden_size: int):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = True
+    renderer = CompletionRenderer(model_config, tokenizer=None)
    # construct arbitrary tensors of various dtypes, layouts, and sizes.
    # We need to check against different layouts to make sure that if a user
    # uses sparse tensors to reduce the transmission size of prompt embeddings,
@@ -87,7 +92,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())

-    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"
@@ -95,3 +100,22 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
    torch.testing.assert_close(loaded_tensor,
                               tensor.to("cpu").to_dense(),
                               equal_nan=True)
+    
+
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("seq_len", [2])
+@pytest.mark.parametrize("hidden_size", [2])
+def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = False
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
+    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
+
+    buffer = io.BytesIO()
+    torch.save(tensor, buffer)
+    buffer.seek(0)
+    encoded_tensor = pybase64.b64encode(buffer.getvalue())
+
+    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
+        renderer.load_prompt_embeds(encoded_tensor)
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -15,31 +15,7 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"


-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--runner",
-        "pooling",
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        DTYPE,
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--skip-tokenizer-init",
-        "--max-num-seqs",
-        "32",
-        "--model-impl",
-        "terratorch"
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_request(server: RemoteOpenAIServer, model_name: str):
-
+def _terratorch_dummy_inputs(model_name: str):
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

@@ -55,7 +31,7 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    binary_data = buffer_coord.read()
    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')

-    prompt = {
+    return {
        "model":
        model_name,
        "additional_data": {
@@ -76,12 +52,34 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
        }]
    }

+
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(model_name: str):
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--max-num-seqs",
+        "32",
+        "--model-impl",
+        "terratorch",
+        "--skip-tokenizer-init",
+        "--enable-mm-embeds",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as server:
+        prompt = _terratorch_dummy_inputs(model_name)
+
        # test single pooling
        response = requests.post(server.url_for("pooling"), json=prompt)
        response.raise_for_status()

-    output = response.json()["data"][0]['data']
+        output = response.json()["data"][0]["data"]
        
        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
-
        assert len(np_response) == 524288
+
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -72,6 +72,19 @@ def phi3v_model_config_mm_interleaved():
    )


+@pytest.fixture(scope="function")
+def phi3v_model_config_image_embeds():
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+        enable_mm_embeds=True,
+    )
+    
+    
 @pytest.fixture(scope="module")
 def phi3v_tokenizer():
    return get_tokenizer(PHI3V_MODEL_ID)
@@ -895,7 +908,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(


 def test_parse_chat_messages_empty_image_embeds_with_uuid(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
 ):
    uuid = "abcd"
@@ -915,7 +928,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                },
            ],
        }],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )
@@ -932,7 +945,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(

 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
 ):
    uuid = "abcd"
@@ -952,7 +965,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                },
            ],
        }],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

--- a/tests/entrypoints/test_renderer.py
+++ b/tests/entrypoints/test_renderer.py
@@ -18,6 +18,7 @@ from vllm.inputs.data import is_embeds_prompt
 class MockModelConfig:
    max_model_len: int = 100
    encoder_config: Optional[dict] = None
+    enable_prompt_embeds: bool = True


 class MockTokenizerResult:

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -102,7 +102,7 @@ VLM_TEST_SETTINGS = {
            limit_mm_per_prompt={"image": 4},
        )],
        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
-        dtype="bfloat16" if current_platform.is_cpu() else "auto",
+        vllm_runner_kwargs={"enable_mm_embeds": True},
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(

--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -277,6 +277,7 @@ def run_embedding_input_test(
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            default_torch_num_threads=1,
+            enable_mm_embeds=True,
    ) as vllm_model:
        outputs_per_case_for_original_input = [
            vllm_model.generate_greedy_logprobs(prompts,

--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -34,6 +34,7 @@ def _run_test(
            dtype="half",
            enforce_eager=True,
            skip_tokenizer_init=True,
+            enable_mm_embeds=True,
            # Limit the maximum number of sequences to avoid the
            # test going OOM during the warmup run
            max_num_seqs=32,

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -90,6 +90,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
        if model_arch == "WhisperForConditionalGeneration":
            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+            
+        extra_args = {}
+        if model_arch in ("PrithviGeoSpatialMAE", "Terratorch"):
+            extra_args["enable_mm_embeds"] = True
+            
        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,
@@ -110,7 +115,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            model_impl="transformers"
            if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
            hf_overrides=hf_overrides_fn,
-            max_num_seqs=model_info.max_num_seqs)
+            max_num_seqs=model_info.max_num_seqs,
+            **extra_args)


 @pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)

--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -30,6 +30,7 @@ def test_inference(
            dtype="half",
            enforce_eager=True,
            skip_tokenizer_init=True,
+            enable_mm_embeds=True,
            # Limit the maximum number of sequences to avoid the
            # test going OOM during the warmup run
            max_num_seqs=32,

--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -38,6 +38,7 @@ def server():
        "prithvi_to_tiff",
        "--model-impl",
        "terratorch",
+        "--enable-mm-embeds",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -7,7 +7,6 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import regex as re
-import requests
 from openai import BadRequestError

 from tests.utils import RemoteOpenAIServer
@@ -688,17 +687,3 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
                }
            },
        )
-
-
-@pytest.mark.asyncio
-async def test_completion_with_empty_prompt_embeds(
-        client: openai.AsyncOpenAI) -> None:
-    """Test completion with empty prompt embeds."""
-    payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []}
-    headers: dict[str, str] = {"Content-Type": "application/json"}
-    # base_url = http://localhost:8000/v1/completions
-    response = requests.post(f"{client.base_url}completions",
-                             headers=headers,
-                             json=payload)
-    assert response.status_code == 200, (
-        f"Expected status code 200, got {response.status_code}. ")
--- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
+++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
@@ -31,6 +31,7 @@ def default_image_embeds_server_args() -> list[str]:
        "4",
        "--enforce-eager",
        "--limit-mm-per-prompt",
+        "--enable-mm-embeds",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]