Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -36,8 +36,7 @@ def _validate_image_max_tokens_one(
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 def test_processor_max_tokens(model_id):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
@@ -166,8 +164,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )

--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -37,8 +37,7 @@ def _validate_image_max_tokens_one(
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 def test_processor_max_tokens(model_id):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
@@ -167,8 +165,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )

--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -35,9 +35,7 @@ def test_processor_override(
    from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID

    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
-        trust_remote_code=True,
+        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": num_imgs},
    )

--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -30,8 +30,7 @@ def test_processor_override(
 ):
    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
    ctx = build_model_context(
-        model_name=model_id,
-        tokenizer_name=model_id,
+        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": num_imgs},
    )

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
 # SPDX-License-Identifier: Apache-2.0

+from collections.abc import Mapping, Set
 from dataclasses import dataclass, field
-from typing import AbstractSet, Any, Literal, Mapping, Optional
+from typing import Any, Literal, Optional

 import pytest
 from packaging.version import Version
@@ -123,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
    "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
    "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
+    "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it",
+                                         min_transformers_version="4.50"),
    "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
    "GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
    "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
@@ -130,6 +133,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"),
    "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
    "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
+    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts",  # noqa: E501
+                                                   min_transformers_version="4.49"),  # noqa: E501
+    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
+                                             trust_remote_code=True),
    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@@ -185,17 +192,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
    "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
                                            trust_remote_code=True),
+    "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
+                                            trust_remote_code=True),
    "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
                                         is_available_online=False,
                                         trust_remote_code=True),
+    "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct",
+                                         min_transformers_version="4.49"),
    # [Encoder-decoder]
    "BartModel": _HfExamplesInfo("facebook/bart-base"),
    "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
-    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
-    # Therefore, we borrow the BartTokenizer from the original Bart model
-    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="facebook/bart-base",
-                                                         trust_remote_code=True),  # noqa: E501
 }

 _EMBEDDING_EXAMPLE_MODELS = {
@@ -214,7 +220,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
    "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"),  # noqa: E501
    "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),  # noqa: E501
    "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),  # noqa: E501
-    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"),
+    "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),
    # [Multimodal]
    "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
    "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
@@ -241,6 +247,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
                                               hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}),  # noqa: E501
    "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
+    "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it",
+                                                      min_transformers_version="4.50"),
    "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
                                        trust_remote_code=True,
                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
@@ -252,7 +260,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
    "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
-                                                     extras={"mistral": "mistral-community/pixtral-12b"}),  # noqa: E501
+                                                     extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
+                                                             "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
    "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"),  # noqa: E501
    "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"),  # noqa: E501
    "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
@@ -271,6 +280,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
+                                        trust_remote_code=True,
+                                        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}),  # noqa: E501),
+    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                        trust_remote_code=True),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                       tokenizer_mode="mistral"),
@@ -282,9 +294,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
                                                          min_transformers_version="4.49"),  # noqa: E501
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                     trust_remote_code=True),
    # [Encoder-decoder]
+    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
+    # Therefore, we borrow the BartTokenizer from the original Bart model
+    "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
+                                                         tokenizer="facebook/bart-base",
+                                                         trust_remote_code=True),  # noqa: E501
    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }
@@ -321,7 +338,7 @@ class HfExampleModels:

        self.hf_models = hf_models

-    def get_supported_archs(self) -> AbstractSet[str]:
+    def get_supported_archs(self) -> Set[str]:
        return self.hf_models.keys()

    def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -6,8 +6,9 @@ import pytest
 from transformers import PretrainedConfig

 from vllm import LLM
+from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
+from vllm.v1.engine.core import EngineCore as V1EngineCore

-from ..conftest import MODELS_ON_S3
 from .registry import HF_EXAMPLE_MODELS


@@ -37,17 +38,20 @@ def test_can_initialize(model_arch):
        return hf_config

    # Avoid calling model.forward()
-    def _initialize_kv_caches(self) -> None:
+    def _initialize_kv_caches_v0(self) -> None:
        self.cache_config.num_gpu_blocks = 0
        self.cache_config.num_cpu_blocks = 0

-    with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
-                      _initialize_kv_caches):
-        model_name = model_info.default
-        if model_name in MODELS_ON_S3:
-            model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
+    def _initalize_kv_caches_v1(self, vllm_config):
+        # gpu_blocks (> 0), cpu_blocks
+        return 1, 0
+
+    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
+                       _initialize_kv_caches_v0),
+          patch.object(V1EngineCore, "_initialize_kv_caches",
+                       _initalize_kv_caches_v1)):
        LLM(
-            model_name,
+            model_info.default,
            tokenizer=model_info.tokenizer,
            tokenizer_mode=model_info.tokenizer_mode,
            speculative_model=model_info.speculative_model,

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
 # SPDX-License-Identifier: Apache-2.0

-import os
-
 import pytest

 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset

-from ..utils import fork_new_process_for_each_test
-
-
-@fork_new_process_for_each_test
-def test_plugin(dummy_opt_path):
-    os.environ["VLLM_PLUGINS"] = ""
-    with pytest.raises(Exception) as excinfo:
-        LLM(model=dummy_opt_path, load_format="dummy")
-    error_msg = "has no vLLM implementation and " \
-                "the Transformers implementation is not compatible with vLLM."
-    assert (error_msg in str(excinfo.value))
-
-
-@fork_new_process_for_each_test
-def test_oot_registration_text_generation(dummy_opt_path):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = ["Hello, my name is", "The text does not matter"]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=dummy_opt_path, load_format="dummy")
-    first_token = llm.get_tokenizer().decode(0)
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        # make sure only the first token is generated
-        rest = generated_text.replace(first_token, "")
-        assert rest == ""
-
-
-@fork_new_process_for_each_test
-def test_oot_registration_embedding(dummy_gemma2_embedding_path):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = ["Hello, my name is", "The text does not matter"]
-    llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
-    outputs = llm.embed(prompts)
-
-    for output in outputs:
-        assert all(v == 0 for v in output.outputs.embedding)
+from ..utils import create_new_process_for_each_test
+
+
+@create_new_process_for_each_test()
+def test_plugin(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_opt_path: str,
+):
+    # V1 shuts down rather than raising an error here.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv("VLLM_PLUGINS", "")
+
+        with pytest.raises(Exception) as excinfo:
+            LLM(model=dummy_opt_path, load_format="dummy")
+        error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM"  # noqa: E501
+        assert (error_msg in str(excinfo.value))
+
+
+@create_new_process_for_each_test()
+def test_oot_registration_text_generation(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_opt_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = ["Hello, my name is", "The text does not matter"]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(model=dummy_opt_path, load_format="dummy")
+        first_token = llm.get_tokenizer().decode(0)
+        outputs = llm.generate(prompts, sampling_params)
+
+        for output in outputs:
+            generated_text = output.outputs[0].text
+            # make sure only the first token is generated
+            rest = generated_text.replace(first_token, "")
+            assert rest == ""
+
+
+@create_new_process_for_each_test()
+def test_oot_registration_embedding(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_gemma2_embedding_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = ["Hello, my name is", "The text does not matter"]
+        llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+        outputs = llm.embed(prompts)
+
+        for output in outputs:
+            assert all(v == 0 for v in output.outputs.embedding)


 image = ImageAsset("cherry_blossom").pil_image.convert("RGB")


-@fork_new_process_for_each_test
-def test_oot_registration_multimodal(dummy_llava_path):
-    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
-    prompts = [{
-        "prompt": "What's in the image?<image>",
-        "multi_modal_data": {
-            "image": image
-        },
-    }, {
-        "prompt": "Describe the image<image>",
-        "multi_modal_data": {
-            "image": image
-        },
-    }]
-
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=dummy_llava_path,
-              load_format="dummy",
-              max_num_seqs=1,
-              trust_remote_code=True,
-              gpu_memory_utilization=0.98,
-              max_model_len=4096,
-              enforce_eager=True,
-              limit_mm_per_prompt={"image": 1})
-    first_token = llm.get_tokenizer().decode(0)
-    outputs = llm.generate(prompts, sampling_params)
-
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        # make sure only the first token is generated
-        rest = generated_text.replace(first_token, "")
-        assert rest == ""
+@create_new_process_for_each_test()
+def test_oot_registration_multimodal(
+    monkeypatch: pytest.MonkeyPatch,
+    dummy_llava_path: str,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_PLUGINS", "register_dummy_model")
+        prompts = [{
+            "prompt": "What's in the image?<image>",
+            "multi_modal_data": {
+                "image": image
+            },
+        }, {
+            "prompt": "Describe the image<image>",
+            "multi_modal_data": {
+                "image": image
+            },
+        }]
+
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(model=dummy_llava_path,
+                  load_format="dummy",
+                  max_num_seqs=1,
+                  trust_remote_code=True,
+                  gpu_memory_utilization=0.98,
+                  max_model_len=4096,
+                  enforce_eager=True,
+                  limit_mm_per_prompt={"image": 1})
+        first_token = llm.get_tokenizer().decode(0)
+        outputs = llm.generate(prompts, sampling_params)
+
+        for output in outputs:
+            generated_text = output.outputs[0].text
+            # make sure only the first token is generated
+            rest = generated_text.replace(first_token, "")
+            assert rest == ""
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                 ModelRegistry)
 from vllm.platforms import current_platform

-from ..utils import fork_new_process_for_each_test
+from ..utils import create_new_process_for_each_test
 from .registry import HF_EXAMPLE_MODELS


@@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
        assert supports_multimodal(model_cls)


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
    ("LlamaForCausalLM", False, False, False),
    ("MllamaForConditionalGeneration", True, False, False),
@@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
                stacklevel=2)


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 @pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
    ("MLPSpeculatorPreTrainedModel", False, False),
    ("DeepseekV2ForCausalLM", True, False),

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -4,7 +4,6 @@
 Run `pytest tests/models/test_transformers.py`.
 """
 from contextlib import nullcontext
-from typing import Type

 import pytest

@@ -14,8 +13,8 @@ from .utils import check_logprobs_close


 def check_implementation(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
@@ -47,8 +46,8 @@ def check_implementation(
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
 def test_models(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
@@ -71,8 +70,8 @@ def test_models(

 @multi_gpu_test(num_gpus=2)
 def test_distributed(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
    example_prompts,
 ):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
@@ -92,7 +91,7 @@ def test_distributed(
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_quantization(
-    vllm_runner: Type[VllmRunner],
+    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
 # SPDX-License-Identifier: Apache-2.0

 import warnings
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from collections.abc import Sequence
+from typing import Any, Optional, Union

 import torch

@@ -9,7 +10,9 @@ from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

-TokensText = Tuple[List[int], str]
+from .registry import HF_EXAMPLE_MODELS
+
+TokensText = tuple[list[int], str]


 def check_outputs_equal(
@@ -46,7 +49,7 @@ def check_outputs_equal(
 # * List of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
+TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
                                                                    float]],
                                                          SampleLogprobs]]]

@@ -57,8 +60,8 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
 # * Optional list of top sample logprobs for each sampled token
 #
 # Assumes prompt logprobs were not requested.
-TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
-                                                        List[Dict[str,
+TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
+                                                        list[dict[str,
                                                                  Logprob]]]]]

 # Representation of generated sequence as a tuple of
@@ -68,9 +71,9 @@ TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
 # * Optional list of top prompt logprobs for each prompt token
 #
 # Allows prompt logprobs to be requested.
-TokensTextLogprobsPromptLogprobs = Tuple[
-    List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
-    Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
+TokensTextLogprobsPromptLogprobs = tuple[
+    list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]],
+    Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]]


 def check_logprobs_close(
@@ -249,21 +252,17 @@ def check_logprobs_close(


 def build_model_context(
-    model_name: str,
+    model_id: str,
    task: TaskOption = "auto",
-    tokenizer_name: Optional[str] = None,
-    trust_remote_code: bool = False,
-    dtype: Optional[Union[str, torch.dtype]] = None,
-    mm_processor_kwargs: Optional[Dict] = None,
-    limit_mm_per_prompt: Optional[Dict] = None,
+    dtype: Union[str, torch.dtype] = "auto",
+    mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    limit_mm_per_prompt: Optional[dict[str, int]] = None,
    disable_mm_preprocessor_cache: bool = True,
 ):
    """Creates an InputContext for a given model.

    Args:
-        model_name: Name of the model being considered.
-        tokenizer_name: Name of the tokenizer being considered.
-        trust_remote_code: Whether or not to allow loading remote code.
+        model_id: ID of the model being considered.
        mm_processor_kwargs: optional processor kwargs for to be leveraged
            in the input processor, mapper, dummy data creation, etc.
        limit_mm_per_prompt: Multimodal limits.
@@ -271,21 +270,21 @@ def build_model_context(
    Returns:
        InputContext for the model being considered.
    """
-    if tokenizer_name is None:
-        tokenizer_name = model_name
-    if dtype is None:
-        dtype = "half"
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")

    model_config = ModelConfig(
-        model_name,
+        model_id,
        task=task,
-        tokenizer=tokenizer_name,
-        tokenizer_mode="auto",
-        trust_remote_code=trust_remote_code,
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
        dtype=dtype,
        seed=0,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+        hf_overrides=model_info.hf_overrides,
    )
    return InputContext(model_config)
--- a/tests/mq_llm_engine/conftest.py
+++ b/tests/mq_llm_engine/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
@@ -13,7 +13,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from ..utils import models_path_prefix

 MODEL = os.path.join(models_path_prefix, "gemma-1.1-2b-it")
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
 RAISED_ERROR = KeyError
 RAISED_VALUE = "foo"
 EXPECTED_TOKENS = 250

--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -19,14 +19,13 @@ from vllm.engine.multiprocessing.engine import MQLLMEngine
 from vllm.entrypoints.openai.api_server import build_async_engine_client
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.lora.request import LoRARequest
+from vllm.sequence import SequenceGroupMetadata
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
 from ..utils import models_path_prefix

 MODEL = os.path.join(models_path_prefix, "gemma-1.1-2b-it")
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
-                              load_format="runai_streamer",
-                              enforce_eager=True)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
 RAISED_ERROR = KeyError
 RAISED_VALUE = "foo"

@@ -238,25 +237,28 @@ async def test_bad_request(tmp_socket):


 @pytest.mark.asyncio
-async def test_mp_crash_detection(monkeypatch):
+async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:

-    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
-    parser = make_arg_parser(parser)
-    args = parser.parse_args([])
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        parser = make_arg_parser(parser)
+        args = parser.parse_args([])

-    # When LLMEngine is loaded, it will crash.
-    def mock_init():
-        raise ValueError
+        # When LLMEngine is loaded, it will crash.
+        def mock_init():
+            raise ValueError

-    monkeypatch.setattr(LLMEngine, "__init__", mock_init)
+        m.setattr(LLMEngine, "__init__", mock_init)

-    start = time.perf_counter()
-    async with build_async_engine_client(args):
-        pass
-    end = time.perf_counter()
+        start = time.perf_counter()
+        async with build_async_engine_client(args):
+            pass
+        end = time.perf_counter()

-    assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
-                              "if there is an error in the startup.")
+        assert end - start < 60, (
+            "Expected vLLM to gracefully shutdown in <60s "
+            "if there is an error in the startup.")


 @pytest.mark.asyncio
@@ -296,3 +298,80 @@ async def test_engine_process_death(tmp_socket):
            await client.check_health()

        client.close()
+
+
+def run_with_evil_input_processing(engine_args: AsyncEngineArgs,
+                                   ipc_path: str):
+    """Simulate an exception while preparing inputs for the model.
+    In the wild, this could be something like a multimodal input processor
+    failing on invalid image data."""
+
+    # Make engine.
+    engine = MQLLMEngine.from_engine_args(
+        engine_args=engine_args,
+        usage_context=UsageContext.UNKNOWN_CONTEXT,
+        ipc_path=ipc_path)
+
+    runner = engine.engine.model_executor.driver_worker.worker.model_runner
+
+    # Raise error in the model runner when adding a sequence group.
+    # See class ModelInputForGPUBuilder
+    def raiser(_, seq_group_metadata: SequenceGroupMetadata):
+        if seq_group_metadata.request_id.startswith("evil"):
+            raise RAISED_ERROR(RAISED_VALUE)
+
+    runner.builder.per_seq_group_compute_fns.append(raiser)
+
+    # Run engine.
+    engine.start()
+
+
+@pytest.mark.asyncio
+async def test_failed_inputs(tmp_socket):
+    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
+                           ipc_path=tmp_socket,
+                           run_fn=run_with_evil_input_processing) as engine:
+
+        client = await engine.make_client()
+        assert client.is_running
+
+        # Engine should be healthy
+        await client.check_health()
+
+        async def run_failing_request():
+            async for _ in client.generate(
+                    prompt="Hello my name is",
+                    sampling_params=SamplingParams(max_tokens=10),
+                    request_id="evil" + str(uuid.uuid4())):
+                pass
+
+        async def run_passing_request():
+            async for _ in client.generate(
+                    prompt="Hello my name is",
+                    sampling_params=SamplingParams(max_tokens=10),
+                    request_id=str(uuid.uuid4())):
+                pass
+
+        passing_tasks = [
+            asyncio.create_task(run_passing_request()) for _ in range(10)
+        ]
+        failing_tasks = [
+            asyncio.create_task(run_failing_request()) for _ in range(10)
+        ]
+        await asyncio.gather(*failing_tasks, return_exceptions=True)
+        await asyncio.gather(*passing_tasks)
+
+        # All the bad inputs should have raised
+        for task in failing_tasks:
+            with pytest.raises(RAISED_ERROR):
+                task.result()
+
+        # But all good inputs should have still succeeded
+        for task in passing_tasks:
+            task.result()
+
+        # And the engine should remain healthy
+        assert not client.errored
+        await client.check_health()
+
+        client.close()
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -17,9 +17,7 @@ NUM_EXPECTED_TOKENS = 10
 NUM_REQUESTS = 10000

 # Scenarios to test for num generated token.
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
-                              load_format="runai_streamer",
-                              disable_log_requests=True)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)


 @pytest.fixture(scope="function")

--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -2,7 +2,7 @@

 import asyncio
 import multiprocessing
-from typing import Callable, Tuple, Union
+from typing import Callable, Union

 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -16,7 +16,7 @@ async def generate(
        client: MQLLMEngineClient,
        request_id: str,
        num_tokens: int,
-        return_output: bool = False) -> Union[RequestOutput, Tuple[int, str]]:
+        return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]:

    final_output = None
    count = 0

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
 # SPDX-License-Identifier: Apache-2.0

 # Test the AsyncLLMEngine with multi-step-decoding
-from typing import List, Optional
+from typing import Optional

 import pytest
 import os

-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close
 from ..utils import (completions_with_server_args, get_client_text_generations,
@@ -18,7 +18,7 @@ MODELS = [
 NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
 NUM_PROMPTS = [10]

-DEFAULT_SERVER_ARGS: List[str] = [
+DEFAULT_SERVER_ARGS: list[str] = [
    "--distributed-executor-backend",
    "ray",
    "--gpu-memory-utilization",
@@ -54,7 +54,7 @@ async def test_multi_step(
    num_logprobs: Optional[int],
    attention_backend: str,
    enable_chunked_prefill: bool,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
    client/server environment.
@@ -84,67 +84,70 @@ async def test_multi_step(
        pytest.skip("Multi-step with Chunked-Prefill only supports"
                    "PP=1 and FLASH_ATTN backend")

-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
-    ms_server_args = DEFAULT_SERVER_ARGS + \
-        ["--num-scheduler-steps", f"{num_scheduler_steps}"]
-
-    if not is_async:
-        ms_server_args += ["--disable-async-output-proc"]
-
-    if eager_mode:
-        ms_server_args.append("--enforce-eager")
-
-    if enable_chunked_prefill:
-        ms_server_args.append("--enable-chunked-prefill")
-
-    distributed_args = [
-        "--tensor-parallel-size",
-        str(tp_size),
-        "--pipeline-parallel-size",
-        str(pp_size),
-    ]
-
-    # Spin up client/server & issue completion API requests.
-    # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 5x to 1200 *just for this test* due to
-    # observed timeouts in GHA CI
-    ref_completions = await completions_with_server_args(
-        prompts,
-        model,
-        server_args + distributed_args,
-        num_logprobs,
-        max_wait_seconds=5 * 240)
-    test_completions = await completions_with_server_args(
-        prompts,
-        model,
-        ms_server_args + distributed_args,
-        num_logprobs,
-        max_wait_seconds=5 * 240)
-
-    # Assert multi-step scheduling produces identical tokens
-    # to single-step scheduling.
-    ref_generations = get_client_text_generations(ref_completions)
-    test_generations = get_client_text_generations(test_completions)
-    assert ref_generations == test_generations
-
-    # Assert multi-step scheduling produces nearly-identical logprobs
-    # to single-step scheduling.
-    ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
-    test_text_logprobs = get_client_text_logprob_generations(test_completions)
-    check_logprobs_close(
-        outputs_0_lst=ref_text_logprobs,
-        outputs_1_lst=test_text_logprobs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
+        ms_server_args = DEFAULT_SERVER_ARGS + \
+            ["--num-scheduler-steps", f"{num_scheduler_steps}"]
+
+        if not is_async:
+            ms_server_args += ["--disable-async-output-proc"]
+
+        if eager_mode:
+            ms_server_args.append("--enforce-eager")
+
+        if enable_chunked_prefill:
+            ms_server_args.append("--enable-chunked-prefill")
+
+        distributed_args = [
+            "--tensor-parallel-size",
+            str(tp_size),
+            "--pipeline-parallel-size",
+            str(pp_size),
+        ]
+
+        # Spin up client/server & issue completion API requests.
+        # Default `max_wait_seconds` is 240 but was empirically
+        # was raised 5x to 1200 *just for this test* due to
+        # observed timeouts in GHA CI
+        ref_completions = await completions_with_server_args(
+            prompts,
+            model,
+            server_args + distributed_args,
+            num_logprobs,
+            max_wait_seconds=5 * 240)
+        test_completions = await completions_with_server_args(
+            prompts,
+            model,
+            ms_server_args + distributed_args,
+            num_logprobs,
+            max_wait_seconds=5 * 240)
+
+        # Assert multi-step scheduling produces identical tokens
+        # to single-step scheduling.
+        ref_generations = get_client_text_generations(ref_completions)
+        test_generations = get_client_text_generations(test_completions)
+        assert ref_generations == test_generations
+
+        # Assert multi-step scheduling produces nearly-identical logprobs
+        # to single-step scheduling.
+        ref_text_logprobs = get_client_text_logprob_generations(
+            ref_completions)
+        test_text_logprobs = get_client_text_logprob_generations(
+            test_completions)
+        check_logprobs_close(
+            outputs_0_lst=ref_text_logprobs,
+            outputs_1_lst=test_text_logprobs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize(("tp_size, pp_size"), [
@@ -154,7 +157,7 @@ async def test_multi_step(
 async def test_multi_step_pp_smoke(
    tp_size: int,
    pp_size: int,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Smoke test for the vLLM engine with multi-step scheduling in an
@@ -176,54 +179,55 @@ async def test_multi_step_pp_smoke(
    attention_backend = "FLASH_ATTN"
    max_num_seqs = 3

-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    # Prompt from the ShareGPT dataset
-    prompts = [
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-        "in the jtbd context whats a push?",  # codespell:ignore
-    ]
-    # Use varying max_tokens to introduce scheduling randomness.
-    max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
-    assert len(prompts) == len(max_tokens)
-
-    test_args = [
-        "--tensor-parallel-size",
-        str(tp_size), "--pipeline-parallel-size",
-        str(pp_size), "--max-num-seqs",
-        str(max_num_seqs)
-    ]
-
-    server_args = DEFAULT_SERVER_ARGS + test_args
-    ms_server_args = DEFAULT_SERVER_ARGS + \
-       ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
-       test_args
-
-    # Spin up client/server & issue completion API requests.
-    # Default `max_wait_seconds` is 240 but was empirically
-    # was raised 3x to 720 *just for this test* due to
-    # observed timeouts in GHA CI
-    ref_completions = await completions_with_server_args(
-        prompts=prompts,
-        model_name=model,
-        server_cli_args=server_args,
-        num_logprobs=None,
-        max_wait_seconds=5 * 240,
-        max_tokens=max_tokens)
-
-    test_completions = await completions_with_server_args(
-        prompts=prompts,
-        model_name=model,
-        server_cli_args=ms_server_args,
-        num_logprobs=None,
-        max_wait_seconds=5 * 240,
-        max_tokens=max_tokens)
-
-    # Assert multi-step scheduling produces identical tokens
-    # to single-step scheduling.
-    ref_generations = get_client_text_generations(ref_completions)
-    test_generations = get_client_text_generations(test_completions)
-
-    assert ref_generations == test_generations
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        # Prompt from the ShareGPT dataset
+        prompts = [
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+            "in the jtbd context whats a push?",  # codespell:ignore
+        ]
+        # Use varying max_tokens to introduce scheduling randomness.
+        max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
+        assert len(prompts) == len(max_tokens)
+
+        test_args = [
+            "--tensor-parallel-size",
+            str(tp_size), "--pipeline-parallel-size",
+            str(pp_size), "--max-num-seqs",
+            str(max_num_seqs)
+        ]
+
+        server_args = DEFAULT_SERVER_ARGS + test_args
+        ms_server_args = DEFAULT_SERVER_ARGS + \
+          ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
+          test_args
+
+        # Spin up client/server & issue completion API requests.
+        # Default `max_wait_seconds` is 240 but was empirically
+        # was raised 3x to 720 *just for this test* due to
+        # observed timeouts in GHA CI
+        ref_completions = await completions_with_server_args(
+            prompts=prompts,
+            model_name=model,
+            server_cli_args=server_args,
+            num_logprobs=None,
+            max_wait_seconds=5 * 240,
+            max_tokens=max_tokens)
+
+        test_completions = await completions_with_server_args(
+            prompts=prompts,
+            model_name=model,
+            server_cli_args=ms_server_args,
+            num_logprobs=None,
+            max_wait_seconds=5 * 240,
+            max_tokens=max_tokens)
+
+        # Assert multi-step scheduling produces identical tokens
+        # to single-step scheduling.
+        ref_generations = get_client_text_generations(ref_completions)
+        test_generations = get_client_text_generations(test_completions)
+
+        assert ref_generations == test_generations
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -8,7 +8,7 @@ from typing import Optional
 import pytest
 import os

-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import models_path_prefix
@@ -44,7 +44,7 @@ def test_multi_step_llm(
    num_prompts: int,
    num_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step scheduling via sync LLM Engine.

@@ -72,48 +72,49 @@ def test_multi_step_llm(
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                    completions endpoint; `None` -> 1 logprob returned.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=enable_chunked_prefill,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
-                        if num_logprobs is None else
-                        vllm_model.generate_greedy_logprobs(
-                            prompts, max_tokens, num_logprobs))
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
-                      if num_logprobs is None else
-                      hf_model.generate_greedy_logprobs_limit(
-                          prompts, max_tokens, num_logprobs))
-
-    if num_logprobs is None:
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=enable_chunked_prefill,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
+                            if num_logprobs is None else
+                            vllm_model.generate_greedy_logprobs(
+                                prompts, max_tokens, num_logprobs))
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
+                          if num_logprobs is None else
+                          hf_model.generate_greedy_logprobs_limit(
+                              prompts, max_tokens, num_logprobs))
+
+        if num_logprobs is None:
+            check_outputs_equal(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )
+        else:
+            check_logprobs_close(
+                outputs_0_lst=hf_outputs,
+                outputs_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+            )


 @pytest.mark.parametrize("model", MODELS)
@@ -138,7 +139,7 @@ def test_multi_step_llm_w_prompt_logprobs(
    num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test prompt logprobs with multi-step scheduling via sync LLM Engine.

@@ -168,47 +169,48 @@ def test_multi_step_llm_w_prompt_logprobs(
                           note that this argument is not supported by the
                           OpenAI completions endpoint.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    prompts = example_prompts
-    if len(prompts) < num_prompts:
-        prompts = prompts * ((num_prompts // len(prompts)) + 1)
-    prompts = prompts[:num_prompts]
-    assert len(prompts) == num_prompts
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-    ) as vllm_model:
-        single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
-            prompts,
-            max_tokens,
-            num_logprobs,
-            num_prompt_logprobs=num_prompt_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=single_step_vllm_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        prompts = example_prompts
+        if len(prompts) < num_prompts:
+            prompts = prompts * ((num_prompts // len(prompts)) + 1)
+        prompts = prompts[:num_prompts]
+        assert len(prompts) == num_prompts
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
+
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+        ) as vllm_model:
+            single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs,
+                num_prompt_logprobs=num_prompt_logprobs)
+
+        check_logprobs_close(
+            outputs_0_lst=single_step_vllm_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @pytest.mark.parametrize("model", MODELS)
@@ -232,7 +234,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
    num_prompts: int,
    num_logprobs: Optional[int],
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.

@@ -295,77 +297,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
    #
    # The Incorrect scheduling behavior - if it occurs - will cause an exception
    # in the model runner resulting from `do_sample=False`.
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    assert len(example_prompts) >= 2
-    challenge_prompts = copy.deepcopy(example_prompts)
-    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
-                            'inference and serving engine for LLMs.\n'
-                            )  # 24 tok
-    challenge_prompts[1] = (
-        'Briefly describe the major milestones in the '
-        'development of artificial intelligence from 1950 to 2020.\n'
-    )  # 30 tok
-
-    # If necessary, adjust the length of `challenge_prompts` to match
-    # `num_prompts`
-    if len(challenge_prompts) < num_prompts:
-        challenge_prompts = (challenge_prompts *
-                             ((num_prompts // len(challenge_prompts)) + 1))
-    challenge_prompts = challenge_prompts[:num_prompts]
-    assert len(challenge_prompts) == num_prompts
-
-    # Single-step scheduler baseline
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_baseline = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                            vllm_model.generate_greedy_logprobs(
-                                challenge_prompts, max_tokens, num_logprobs))
-
-    # multi-step+"single-step chunked prefill"+APC
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enforce_eager=enforce_eager,
-            gpu_memory_utilization=0.7,
-            tensor_parallel_size=tp_size,
-            enable_chunked_prefill=True,
-            enable_prefix_caching=True,
-            num_scheduler_steps=num_scheduler_steps,
-            max_model_len=48,
-            max_num_batched_tokens=48,
-            max_num_seqs=4,
-            block_size=16,
-    ) as vllm_model:
-        outputs_w_features = (vllm_model.generate_greedy(
-            challenge_prompts, max_tokens) if num_logprobs is None else
-                              vllm_model.generate_greedy_logprobs(
-                                  challenge_prompts, max_tokens, num_logprobs))
-
-    if num_logprobs is None:
-        # No-logprobs test
-        check_outputs_equal(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
-    else:
-        # Yes-logprobs test
-        check_logprobs_close(
-            outputs_0_lst=outputs_baseline,
-            outputs_1_lst=outputs_w_features,
-            name_0="multi-step",
-            name_1="multi-step+features",
-        )
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+
+        assert len(example_prompts) >= 2
+        challenge_prompts = copy.deepcopy(example_prompts)
+        challenge_prompts[0] = (
+            'vLLM is a high-throughput and memory-efficient '
+            'inference and serving engine for LLMs.\n')  # 24 tok
+        challenge_prompts[1] = (
+            'Briefly describe the major milestones in the '
+            'development of artificial intelligence from 1950 to 2020.\n'
+        )  # 30 tok
+
+        # If necessary, adjust the length of `challenge_prompts` to match
+        # `num_prompts`
+        if len(challenge_prompts) < num_prompts:
+            challenge_prompts = (challenge_prompts *
+                                 ((num_prompts // len(challenge_prompts)) + 1))
+        challenge_prompts = challenge_prompts[:num_prompts]
+        assert len(challenge_prompts) == num_prompts
+
+        # Single-step scheduler baseline
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_baseline = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
+
+        # multi-step+"single-step chunked prefill"+APC
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=0.7,
+                tensor_parallel_size=tp_size,
+                enable_chunked_prefill=True,
+                enable_prefix_caching=True,
+                num_scheduler_steps=num_scheduler_steps,
+                max_model_len=48,
+                max_num_batched_tokens=48,
+                max_num_seqs=4,
+                block_size=16,
+        ) as vllm_model:
+            outputs_w_features = (
+                vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+                num_logprobs is None else vllm_model.generate_greedy_logprobs(
+                    challenge_prompts, max_tokens, num_logprobs))
+
+        if num_logprobs is None:
+            # No-logprobs test
+            check_outputs_equal(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )
+        else:
+            # Yes-logprobs test
+            check_logprobs_close(
+                outputs_0_lst=outputs_baseline,
+                outputs_1_lst=outputs_w_features,
+                name_0="multi-step",
+                name_1="multi-step+features",
+            )
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -7,18 +7,24 @@ from unittest.mock import MagicMock

 import numpy as np
 import pytest
+import torch
 from transformers import ProcessorMixin

 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem,
+                                    MultiModalSharedField)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
-                                        PromptReplacement,
+                                        ProcessingCache, PromptIndexTargets,
+                                        PromptInsertion, PromptReplacement,
+                                        apply_text_matches,
+                                        apply_token_matches,
                                        find_mm_placeholders,
                                        find_text_matches, find_token_matches,
                                        iter_token_matches,
-                                        replace_text_matches,
                                        replace_token_matches)
 # yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
    assert all(match_len == len(match_ids) for match_len in match_lens)


+# yapf: disable
+@pytest.mark.parametrize(
+    ("token_ids", "match_ids", "new_ids", "expected"),
+    [
+        ([], [], [-1], []),
+        ([], [32000], [-1], []),
+        (
+            [32000, 32000, 32000],
+            [32000],
+            [-1],
+            [-1, -1, -1],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000],
+            [-1],
+            [-1, 32000],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000, 32000],
+            [-1],
+            [-1],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000],
+            [-1],
+            [9833, -1, 32000, 32000, 9833, -1, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000, 32000, 32000],
+            [-1],
+            [9833, -1, 9833, 28747, 32000, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 0, 32000],
+            [-1],
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+        ),
+    ],
+)
+# yapf: enable
+def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
+    result = replace_token_matches(token_ids, match_ids, new_ids)
+
+    # Manually constructed results
+    assert result == expected
+
+
 # yapf: disable
 @pytest.mark.parametrize(
    ("prompt", "target_by_key", "expected_by_key"),
@@ -98,11 +156,21 @@ def test_iter_token_matches(token_ids, match_ids, expected):
            {
                "pattern_1": [],
                "pattern_2": [32000],
+                "pattern_3": PromptIndexTargets.start(),
+                "pattern_4": PromptIndexTargets.prefix([32000]),
+                "pattern_5": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [],
                "pattern_2": [],
-            }
+                "pattern_3": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_4": [],
+                "pattern_5": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+            },
        ),
        (
            [32000, 32000, 32000, 32000],
@@ -110,6 +178,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
                "pattern_1": [32000],
                "pattern_2": [32000, 32000],
                "pattern_3": [32000, 32000, 32000],
+                "pattern_4": PromptIndexTargets.start(),
+                "pattern_5": PromptIndexTargets.prefix([32000]),
+                "pattern_6": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [
@@ -125,6 +196,15 @@ def test_iter_token_matches(token_ids, match_ids, expected):
                "pattern_3": [
                    { "start_idx": 0, "end_idx": 3 },
                ],
+                "pattern_4": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_5": [
+                    { "start_idx": 1, "end_idx": 1 },
+                ],
+                "pattern_6": [
+                    { "start_idx": 4, "end_idx": 4 },
+                ],
            },
        ),
        (
@@ -133,6 +213,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
                "pattern_1": [28747, 32000],
                "pattern_2": [28747, 32000, 32000, 32000],
                "pattern_3": [28747, 0, 32000],
+                "pattern_4": PromptIndexTargets.start(),
+                "pattern_5": PromptIndexTargets.prefix([28747, 32000]),
+                "pattern_6": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [
@@ -143,20 +226,33 @@ def test_iter_token_matches(token_ids, match_ids, expected):
                    { "start_idx": 1, "end_idx": 5 },
                ],
                "pattern_3": [],
+                "pattern_4": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_5": [],
+                "pattern_6": [
+                    { "start_idx": 10, "end_idx": 10 },
+                ],
            },
        ),
    ],
 )
+@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
 # yapf: enable
-def test_find_token_matches(prompt, target_by_key, expected_by_key):
+def test_find_token_matches(
+    prompt,
+    target_by_key,
+    expected_by_key,
+    update_type,
+):
    # Should not be used since there is nothing to convert to token IDs
    mock_tokenizer = cast(AnyTokenizer, object())

-    prompt_repls = [
-        PromptReplacement(key, target, []).bind(mock_tokenizer)
+    prompt_updates = [
+        update_type(key, target, []).bind(mock_tokenizer)
        for key, target in target_by_key.items()
    ]
-    result = find_token_matches(prompt, prompt_repls)
+    result = find_token_matches(prompt, prompt_updates)

    # Only displayed on error
    print("result:", result)
@@ -183,10 +279,20 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
            {
                "pattern_1": "",
                "pattern_2": "<image>",
+                "pattern_3": PromptIndexTargets.start(),
+                "pattern_4": PromptIndexTargets.prefix("<image>"),
+                "pattern_5": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
                "pattern_2": [],
+                "pattern_3": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_4": [],
+                "pattern_5": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
            }
        ),
        (
@@ -195,6 +301,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
                "pattern_1": "<image>",
                "pattern_2": "<image><image>",
                "pattern_3": "<image><image><image>",
+                "pattern_4": PromptIndexTargets.start(),
+                "pattern_5": PromptIndexTargets.prefix("<image>"),
+                "pattern_6": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [
@@ -210,6 +319,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
                "pattern_3": [
                    { "start_idx": 0, "end_idx": 21 },
                ],
+                "pattern_4": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_5": [
+                    { "start_idx": 7, "end_idx": 7 },
+                ],
+                "pattern_6": [
+                    { "start_idx": 28, "end_idx": 28 },
+                ],
            },
        ),
        (
@@ -218,6 +336,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
                "pattern_1": "Image:<image>",
                "pattern_2": "Image:<image><image><image>",
                "pattern_3": "Image:<unk><image>",
+                "pattern_4": PromptIndexTargets.start(),
+                "pattern_5": PromptIndexTargets.prefix("Image:<image>"),
+                "pattern_6": PromptIndexTargets.end(),
            },
            {
                "pattern_1": [
@@ -228,6 +349,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
                    { "start_idx": 0, "end_idx": 27 },
                ],
                "pattern_3": [],
+                "pattern_4": [
+                    { "start_idx": 0, "end_idx": 0 },
+                ],
+                "pattern_5": [
+                    { "start_idx": 13, "end_idx": 13 },
+                ],
+                "pattern_6": [
+                    { "start_idx": 48, "end_idx": 48 },
+                ],
            },
        ),
        # Test regex escape
@@ -254,16 +384,22 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
        ),
    ],
 )
+@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
 # yapf: enable
-def test_find_text_matches(prompt, target_by_key, expected_by_key):
+def test_find_text_matches(
+    prompt,
+    target_by_key,
+    expected_by_key,
+    update_type,
+):
    # Should not be used since there is nothing to convert to text
    mock_tokenizer = cast(AnyTokenizer, object())

-    prompt_repls = [
-        PromptReplacement(key, target, []).bind(mock_tokenizer)
+    prompt_updates = [
+        update_type(key, target, []).bind(mock_tokenizer)
        for key, target in target_by_key.items()
    ]
-    result = find_text_matches(prompt, prompt_repls)
+    result = find_text_matches(prompt, prompt_updates)

    # Only displayed on error
    print("result:", result)
@@ -281,7 +417,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):

 # yapf: disable
 @pytest.mark.parametrize(
-    ("prompt", "target_by_key", "repl_by_key"),
+    ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"),  # noqa: E501
    [
        (
            "Image:<image>Image:<image><image>!",
@@ -300,58 +436,160 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
                # Test dynamic replacement (beyond the form of `unit * count`)
                "pattern_3": "?!?",
            },
+            {
+                PromptInsertion: {
+                    0: "Image:<image>Image:<image><image>!",
+                    1: "Image:<image><image><image>Image:<image><image>!?!?",
+                    2: "Image:<image><image><image><image><image>Image:<image><image>!?!??!?",  # noqa: E501
+                },
+                PromptReplacement: {
+                    0: "Image:<image>Image:<image><image>!",
+                    1: "<image><image>Image:<image><image>?!?",
+                    2: "<image><image><image><image><image>?!?",
+                },
+            },
+        ),
+        # Test index targets
+        (
+            "",
+            {
+                "pattern_1": PromptIndexTargets.start(),
+                "pattern_2": PromptIndexTargets.prefix("<image>"),
+                "pattern_3": PromptIndexTargets.end(),
+            },
+            {
+                "pattern_1": "1",
+                "pattern_2": "2",
+                "pattern_3": "3",
+            },
+            {
+                PromptInsertion: {
+                    0: "",
+                    1: "13",
+                    2: "1133",
+                },
+                PromptReplacement: {
+                    0: "",
+                    1: "13",
+                    2: "1133",
+                },
+            },
+        ),
+        (
+            "<image>",
+            {
+                "pattern_1": PromptIndexTargets.start(),
+                "pattern_2": PromptIndexTargets.prefix("<image>"),
+                "pattern_3": PromptIndexTargets.end(),
+            },
+            {
+                "pattern_1": "1",
+                "pattern_2": "2",
+                "pattern_3": "3",
+            },
+            {
+                PromptInsertion: {
+                    0: "<image>",
+                    1: "1<image>23",
+                    2: "11<image>2233",
+                },
+                PromptReplacement: {
+                    0: "<image>",
+                    1: "1<image>23",
+                    2: "11<image>2233",
+                },
+            },
+        ),
+        # Test different replacement per item
+        (
+            "<image><image><image>",
+            {
+                "pattern_1": "<image>",
+            },
+            {
+                "pattern_1": lambda idx: str(idx + 1),
+            },
+            {
+                PromptInsertion: {
+                    0: "<image><image><image>",
+                    1: "<image>1<image><image>",
+                    2: "<image>12<image><image>",
+                },
+                PromptReplacement: {
+                    0: "<image><image><image>",
+                    1: "1<image><image>",
+                    2: "12<image>",
+                },
+            },
+        ),
+        (
+            "<image><image><image>",
+            {
+                "pattern_1": PromptIndexTargets.prefix("<image>"),
+            },
+            {
+                "pattern_1": lambda idx: str(idx + 1),
+            },
+            {
+                PromptInsertion: {
+                    0: "<image><image><image>",
+                    1: "<image>1<image><image>",
+                    2: "<image>12<image><image>",
+                },
+                PromptReplacement: {
+                    0: "<image><image><image>",
+                    1: "<image>1<image><image>",
+                    2: "<image>12<image><image>",
+                },
+            },
        ),
-    ]
-)
-@pytest.mark.parametrize(
-    ("mm_count", "expected"),
-    [
-        (0, "Image:<image>Image:<image><image>!"),
-        (1, "<image><image>Image:<image><image>?!?"),
-        (2, "<image><image><image><image><image>?!?"),
    ]
 )
 # yapf: enable
-def test_find_replace_text(
+def test_find_update_text(
    prompt,
    target_by_key,
    repl_by_key,
-    mm_count,
-    expected,
+    expected_by_update_type_mm_count,
 ):
    # Should not be used since there is nothing to convert to text
    mock_tokenizer = cast(AnyTokenizer, object())

-    mm_prompt_repls = {
-        key: [
-            PromptReplacement(key, target,
-                              repl_by_key[key]).bind(mock_tokenizer)
-        ]
-        for key, target in target_by_key.items()
-    }
-    mm_matches = {
-        key: find_text_matches(prompt, prompt_repls)
-        for key, prompt_repls in mm_prompt_repls.items()
-    }
-
-    result = replace_text_matches(
-        prompt,
-        mm_matches,
-        {key: mm_count
-         for key in repl_by_key},
-    )
-
-    # Only displayed on error
-    print("mm_matches:", mm_matches)
-    print("result:", result)
-
-    # Manually constructed results
-    assert result == expected
+    for (
+            update_type,
+            expected_by_mm_count,
+    ) in expected_by_update_type_mm_count.items():
+        mm_prompt_updates = {
+            key:
+            [update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
+            for key, target in target_by_key.items()
+        }
+        mm_matches = {
+            key: find_text_matches(prompt, updates)
+            for key, updates in mm_prompt_updates.items()
+        }
+
+        for mm_count, expected in expected_by_mm_count.items():
+            result = apply_text_matches(
+                prompt,
+                mm_matches,
+                {key: mm_count
+                 for key in repl_by_key},
+            )
+
+            # Only displayed on error
+            print("update_type:", update_type)
+            print("mm_count:", mm_count)
+            print("mm_matches:", mm_matches)
+            print("result:", result)
+
+            # Manually constructed results
+            assert result == expected


 # yapf: disable
 @pytest.mark.parametrize(
-    ("prompt", "target_by_key", "repl_by_key"),
+    ("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"),  # noqa: E501
    [
        # Tokenized test cases of `test_find_replace_text`
        # using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
@@ -372,53 +610,155 @@ def test_find_replace_text(
                # Test dynamic replacement (beyond the form of `unit * count`)
                "pattern_3": [1550, 918, 1550],
            },
+            {
+                PromptInsertion: {
+                    0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
+                    1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550],  # noqa: E501
+                    2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550],  # noqa: E501
+                },
+                PromptReplacement: {
+                    0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
+                    1: [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],  # noqa: E501
+                    2: [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
+                },
+            },
+        ),
+        # Test index targets
+        (
+            [],
+            {
+                "pattern_1": PromptIndexTargets.start(),
+                "pattern_2": PromptIndexTargets.prefix([32000]),
+                "pattern_3": PromptIndexTargets.end(),
+            },
+            {
+                "pattern_1": [-1],
+                "pattern_2": [-2],
+                "pattern_3": [-3],
+            },
+            {
+                PromptInsertion: {
+                    0: [],
+                    1: [-1, -3],
+                    2: [-1, -1, -3, -3],
+                },
+                PromptReplacement: {
+                    0: [],
+                    1: [-1, -3],
+                    2: [-1, -1, -3, -3],
+                },
+            },
+        ),
+        (
+            [32000],
+            {
+                "pattern_1": PromptIndexTargets.start(),
+                "pattern_2": PromptIndexTargets.prefix([32000]),
+                "pattern_3": PromptIndexTargets.end(),
+            },
+            {
+                "pattern_1": [-1],
+                "pattern_2": [-2],
+                "pattern_3": [-3],
+            },
+            {
+                PromptInsertion: {
+                    0: [32000],
+                    1: [-1, 32000, -2, -3],
+                    2: [-1, -1, 32000, -2, -2, -3, -3],
+                },
+                PromptReplacement: {
+                    0: [32000],
+                    1: [-1, 32000, -2, -3],
+                    2: [-1, -1, 32000, -2, -2, -3, -3],
+                },
+            },
+        ),
+        # Test different replacement per item
+        (
+            [32000, 32000, 32000],
+            {
+                "pattern_1": [32000],
+            },
+            {
+                "pattern_1": lambda idx: [-(idx + 1)],
+            },
+            {
+                PromptInsertion: {
+                    0: [32000, 32000, 32000],
+                    1: [32000, -1, 32000, 32000],
+                    2: [32000, -1, -2, 32000, 32000],
+                },
+                PromptReplacement: {
+                    0: [32000, 32000, 32000],
+                    1: [-1, 32000, 32000],
+                    2: [-1, -2, 32000],
+                },
+            },
+        ),
+        (
+            [32000, 32000, 32000],
+            {
+                "pattern_1": PromptIndexTargets.prefix([32000]),
+            },
+            {
+                "pattern_1": lambda idx: [-(idx + 1)],
+            },
+            {
+                PromptInsertion: {
+                    0: [32000, 32000, 32000],
+                    1: [32000, -1, 32000, 32000],
+                    2: [32000, -1, -2, 32000, 32000],
+                },
+                PromptReplacement: {
+                    0: [32000, 32000, 32000],
+                    1: [32000, -1, 32000, 32000],
+                    2: [32000, -1, -2, 32000, 32000],
+                },
+            },
        ),
-    ]
-)
-@pytest.mark.parametrize(
-    ("mm_count", "expected"),
-    [
-        (0, [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918]),
-        (1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550]),
-        (2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550]),
    ]
 )
 # yapf: enable
-def test_find_replace_tokens(
+def test_find_update_tokens(
    prompt,
    target_by_key,
    repl_by_key,
-    mm_count,
-    expected,
+    expected_by_update_type_mm_count,
 ):
    # Should not be used since there is nothing to convert to tokens
    mock_tokenizer = cast(AnyTokenizer, object())

-    mm_prompt_repls = {
-        key: [
-            PromptReplacement(key, target,
-                              repl_by_key[key]).bind(mock_tokenizer)
-        ]
-        for key, target in target_by_key.items()
-    }
-    mm_matches = {
-        key: find_token_matches(prompt, prompt_repls)
-        for key, prompt_repls in mm_prompt_repls.items()
-    }
-
-    result = replace_token_matches(
-        prompt,
-        mm_matches,
-        {key: mm_count
-         for key in repl_by_key},
-    )
-
-    # Only displayed on error
-    print("mm_matches:", mm_matches)
-    print("result:", result)
-
-    # Manually constructed results
-    assert result == expected
+    for (
+            update_type,
+            expected_by_mm_count,
+    ) in expected_by_update_type_mm_count.items():
+        mm_prompt_updates = {
+            key:
+            [update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
+            for key, target in target_by_key.items()
+        }
+        mm_matches = {
+            key: find_token_matches(prompt, updates)
+            for key, updates in mm_prompt_updates.items()
+        }
+
+        for mm_count, expected in expected_by_mm_count.items():
+            result = apply_token_matches(
+                prompt,
+                mm_matches,
+                {key: mm_count
+                 for key in repl_by_key},
+            )
+
+            # Only displayed on error
+            print("update_type:", update_type)
+            print("mm_count:", mm_count)
+            print("mm_matches:", mm_matches)
+            print("result:", result)
+
+            # Manually constructed results
+            assert result == expected


 # yapf: disable
@@ -524,22 +864,24 @@ def test_find_replace_tokens(
        ),
    ]
 )
+@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
 # yapf: enable
 def test_find_mm_placeholders(
    repl_by_key,
    prompt,
    expected,
+    update_type,
 ):
    # Should not be used since there is nothing to convert to tokens
    mock_tokenizer = cast(AnyTokenizer, object())

-    mm_prompt_repls = {
-        key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)]
+    mm_prompt_updates = {
+        key: [update_type(key, [], repl).bind(mock_tokenizer)]
        for key, repl in repl_by_key.items()
    }

    result = find_mm_placeholders(
-        mm_prompt_repls,
+        mm_prompt_updates,
        prompt,
        # Effectively match all occurrences in the prompt
        {key: 3
@@ -553,8 +895,46 @@ def test_find_mm_placeholders(
    assert result == expected


+def _dummy_elem(modality: str, key: str, size: int):
+    return MultiModalFieldElem(
+        modality=modality,
+        key=key,
+        data=torch.empty((size, ), dtype=torch.int8),
+        field=MultiModalSharedField(1),
+    )
+
+
+def _dummy_item(modality: str, size_by_key: dict[str, int]):
+    return MultiModalKwargsItem.from_elems([
+        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
+    ])
+
+
+def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
+    return MultiModalKwargs.from_items([
+        _dummy_item(modality, size_by_key)
+        for modality, size_by_key in size_by_key_modality.items()
+    ])
+
+
+# yapf: disable
 @pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+    ("item", "expected_size"),
+    [
+        (_dummy_item("a", {"a1": 100}), 100),
+        (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
+        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+    ],
+)
+# yapf: enable
+def test_cache_item_size(item, expected_size):
+    cache = ProcessingCache.get_lru_cache(2048, type(item))
+    cache[""] = item
+
+    assert cache.currsize == expected_size
+
+
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
    ("limit", "num_supported", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
@@ -570,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
@@ -590,11 +970,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
        exc_ctx = pytest.raises(ValueError, match="this model only supports")

    with exc_ctx:
-        profiler.get_dummy_data(model_config.max_model_len)
+        profiler.get_decoder_dummy_data(model_config.max_model_len)


-@pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
    ("num_images", "limit", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
@@ -610,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
@@ -683,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
-        dtype="half",
+        dtype="auto",
        revision=None,
    )


--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -4,7 +4,7 @@ import base64
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple
+from typing import TYPE_CHECKING, NamedTuple, Optional

 import numpy as np
 import pytest
@@ -33,7 +33,7 @@ TEST_IMAGE_URLS = [


 @pytest.fixture(scope="module")
-def url_images() -> Dict[str, Image.Image]:
+def url_images() -> dict[str, Image.Image]:
    connector = MediaConnector()

    return {
@@ -42,7 +42,7 @@ def url_images() -> Dict[str, Image.Image]:
    }


-def get_supported_suffixes() -> Tuple[str, ...]:
+def get_supported_suffixes() -> tuple[str, ...]:
    # We should at least test the file types mentioned in GPT-4 with Vision
    OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

@@ -69,7 +69,7 @@ async def test_fetch_image_http(image_url: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
-async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
+async def test_fetch_image_base64(url_images: dict[str, Image.Image],
                                  image_url: str, suffix: str):
    connector = MediaConnector()
    url_image = url_images[image_url]

--- a/tests/neuron/1_core/test_activation.py
+++ b/tests/neuron/1_core/test_activation.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
+from vllm.platforms import current_platform
+
+
+@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
+@pytest.mark.parametrize("num_tokens,d,dtype", [
+    (7, 512, torch.half),
+    (7, 512, torch.float),
+    (83, 512, torch.half),
+])
+@torch.inference_mode()
+def test_act_and_mul(
+    activation: str,
+    num_tokens: int,
+    d: int,
+    dtype: torch.dtype,
+) -> None:
+    import torch_xla.core.xla_model as xm
+
+    device = xm.xla_device()
+    current_platform.seed_everything(0)
+    torch.set_default_device("cpu")
+    x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device)
+    if activation == "silu_and_mul":
+        layer = SiluAndMul()
+        fn = layer.forward_native
+    elif activation == "gelu_fast":
+        layer = FastGELU()
+        fn = F.gelu
+    else:
+        raise NotImplementedError(
+            f"activation {activation} is not implemented.")
+    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
+    out = layer.to(device=device).forward_neuron(x)
+    ref_out = fn(x.cpu())
+    torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)