Merge remote-tracking branch 'mirror/main'

4b4eeb26 · zhuwenwen · 2216a4e5 · 4fdc581f · 4b4eeb26 · 4b4eeb26
Commit 4b4eeb26 authored Oct 24, 2024 by zhuwenwen
20 changed files
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
+name: 'Close inactive issues and PRs'
+on:
+  schedule:
+    # Daily at 1:30 AM UTC (cron fields: minute hour day-of-month month day-of-week)
+    - cron: '30 1 * * *'
+jobs:
+  close-issues-and-pull-requests:
+    permissions:  # write access is requested only for issues and pull requests
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 (pinned to a full commit SHA)
+        with:
+          exempt-draft-pr: true
+          exempt-issue-labels: 'keep-open'
+          exempt-pr-labels: 'keep-open'
+          labels-to-add-when-unstale: 'unstale'
+          labels-to-remove-when-stale: 'unstale'
+          days-before-issue-stale: 90  # keep in sync with the "90 days" wording in stale-issue-message
+          days-before-issue-close: 30  # keep in sync with the "30 days" wording in stale-issue-message
+          stale-issue-label: 'stale'
+          stale-issue-message: >
+            This issue has been automatically marked as stale because it has not
+            had any activity within 90 days. It will be automatically closed if no
+            further activity occurs within 30 days. Leave a comment if
+            you feel this issue should remain open. Thank you!
+          close-issue-message: >
+            This issue has been automatically closed due to inactivity. Please
+            feel free to reopen if you feel it is still relevant. Thank you!
+          days-before-pr-stale: 90  # keep in sync with the "90 days" wording in stale-pr-message
+          days-before-pr-close: 30  # keep in sync with the "30 days" wording in stale-pr-message
+          stale-pr-label: 'stale'
+          stale-pr-message: >
+            This pull request has been automatically marked as stale because it
+            has not had any activity within 90 days. It will be automatically
+            closed if no further activity occurs within 30 days. Leave a comment
+            if you feel this pull request should remain open. Thank you!
+          close-pr-message: >
+            This pull request has been automatically closed due to inactivity.
+            Please feel free to reopen if you intend to continue working on it.
+            Thank you!
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -322,7 +322,6 @@ def main(args: argparse.Namespace):
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
    # Output JSON results if specified
    if args.output_json:
        results = {

--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -425,7 +425,7 @@ Text Generation
    -
  * - :code:`MolmoForCausalLM`
    - Molmo
-    - Image
+    - T + I
    - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
    -
    - ✅︎
@@ -459,6 +459,12 @@ Text Generation
    - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
    -
    - ✅︎
+  * - :code:`Qwen2AudioForConditionalGeneration`
+    - Qwen2-Audio
+    - T + A\ :sup:`+`
+    - :code:`Qwen/Qwen2-Audio-7B-Instruct`
+    -
+    - ✅︎
  * - :code:`Qwen2VLForConditionalGeneration`
    - Qwen2-VL
    - T + I\ :sup:`E+` + V\ :sup:`+`

--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -103,6 +103,23 @@ vllm serve <model> --chat-template ./path-to-chat-template.jinja
 vLLM community provides a set of chat templates for popular models. You can find them in the examples
 directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
+With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format that specifies
+both a `type` and a `text` field. An example is provided below:
+```python
+completion = client.chat.completions.create(
+  model="NousResearch/Meta-Llama-3-8B-Instruct",
+  messages=[
+    {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
+  ]
+)
+```
+Most chat templates for LLMs expect the `content` to be a `string`, but some newer models, such as
+`meta-llama/Llama-Guard-3-1B`, expect the content to be parsed according to the new OpenAI spec. To choose the
+format in which vLLM parses the content, use the `--chat-template-text-format` argument to specify
+either `string` or `openai`. The default value is `string`, and vLLM internally converts both spec formats to match
+this unless explicitly specified otherwise.
 ## Command line arguments for the server
 ```{argparse}

--- a/examples/florence2_inference.py
+++ b/examples/florence2_inference.py
+'''
+Demonstrate prompting of text-to-text
+encoder/decoder models, specifically Florence-2
+'''
+# TODO(Isotr0py):
+# Move to offline_inference_vision_language.py after porting vision backbone
+from vllm import LLM, SamplingParams
+dtype = "float"  # passed unchanged as the dtype argument of LLM below
+# Create a Florence-2 encoder/decoder model instance
+llm = LLM(
+    model="microsoft/Florence-2-base",
+    tokenizer="facebook/bart-base",  # tokenizer name differs from the model name
+    dtype=dtype,
+    trust_remote_code=True,
+)
+prompts = [  # plain-text prompts only; no image input is passed in this example
+    "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
+    "<CAPTION_TO_PHRASE_GROUNDING>", "<OD>", "<DENSE_REGION_CAPTION>",
+    "<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(
+    temperature=0,  # NOTE(review): presumably selects greedy decoding -- confirm
+    top_p=1.0,
+    min_tokens=0,
+    max_tokens=20,
+)
+# Generate output tokens from the prompts. The output is a list of
+# RequestOutput objects that contain the prompt, generated
+# text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the encoder prompt, decoder prompt and first completion of each output.
+for output in outputs:
+    prompt = output.prompt
+    encoder_prompt = output.encoder_prompt
+    generated_text = output.outputs[0].text
+    print(f"Encoder prompt: {encoder_prompt!r}, "
+          f"Decoder prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
--- a/examples/offline_inference_audio_language.py
+++ b/examples/offline_inference_audio_language.py
@@ -12,14 +12,15 @@ from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
-question_per_audio_count = [
+question_per_audio_count = {
-    "What is recited in the audio?",
+    0: "What is 1+1?",
-    "What sport and what nursery rhyme are referenced?"
+    1: "What is recited in the audio?",
-]
+    2: "What sport and what nursery rhyme are referenced?"
+}
 # Ultravox 0.3
-def run_ultravox(question, audio_count):
+def run_ultravox(question: str, audio_count: int):
    model_name = "fixie-ai/ultravox-v0_3"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
    return llm, prompt, stop_token_ids
-model_example_map = {
+# Qwen2-Audio
-    "ultravox": run_ultravox,
+def run_qwen2_audio(question: str, audio_count: int):
-}
+    model_name = "Qwen/Qwen2-Audio-7B-Instruct"
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+    audio_in_prompt = "".join([
+        f"Audio {idx+1}: "
+        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+    ])
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_in_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}
 def main(args):
@@ -54,7 +75,7 @@ def main(args):
    audio_count = args.num_audios
    llm, prompt, stop_token_ids = model_example_map[model](
-        question_per_audio_count[audio_count - 1], audio_count)
+        question_per_audio_count[audio_count], audio_count)
    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
@@ -62,16 +83,17 @@ def main(args):
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)
-    assert args.num_prompts > 0
+    mm_data = {}
-    inputs = {
+    if audio_count > 0:
-        "prompt": prompt,
+        mm_data = {
-        "multi_modal_data": {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
-        },
+        }
-    }
+    assert args.num_prompts > 0
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts
@@ -100,7 +122,7 @@ if __name__ == "__main__":
    parser.add_argument("--num-audios",
                        type=int,
                        default=1,
-                        choices=[1, 2],
+                        choices=[0, 1, 2],
                        help="Number of audio items per prompt.")
    args = parser.parse_args()

--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -267,6 +267,11 @@ def run_qwen2_vl(question: str, modality: str):
        model=model_name,
        max_model_len=8192,
        max_num_seqs=5,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+        },
    )
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -253,7 +253,9 @@ class HfRunner:
        dtype: str = "half",
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
+        is_embedding_model: bool = False,
        is_sentence_transformer: bool = False,
+        skip_tokenizer_init: bool = False,
        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[[BatchEncoding],
                                     BatchEncoding] = identity,
@@ -281,11 +283,12 @@ class HfRunner:
                    **model_kwargs,
                ))
-        self.tokenizer = AutoTokenizer.from_pretrained(
+        if not skip_tokenizer_init:
-            model_name,
+            self.tokenizer = AutoTokenizer.from_pretrained(
-            torch_dtype=torch_dtype,
+                model_name,
-            trust_remote_code=True,
+                torch_dtype=torch_dtype,
-        )
+                trust_remote_code=True,
+            )
        # don't put this import at the top level
        # it will call torch.cuda.device_count()
@@ -295,6 +298,8 @@ class HfRunner:
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
+        if skip_tokenizer_init:
+            self.tokenizer = self.processor.tokenizer
        self.postprocess_inputs = postprocess_inputs
@@ -535,6 +540,7 @@ class HfRunner:
        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
+        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
        '''
@@ -545,11 +551,17 @@ class HfRunner:
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []
-        for (encoder_prompt,
+        for i, (encoder_prompt, decoder_prompt) in enumerate(
-             decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
+                to_enc_dec_tuple_list(encoder_decoder_prompts)):
+            processor_kwargs: Dict[str, Any] = {
+                "text": encoder_prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
            encoder_input_ids = self.wrap_device(
-                self.tokenizer(encoder_prompt, return_tensors="pt").input_ids,
+                self.processor(**processor_kwargs).input_ids,
                device=self.model.device.type,
            )

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -4,7 +4,6 @@ from unittest.mock import MagicMock
 import pytest  # noqa
 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import Logprob, SequenceGroup
@@ -347,158 +346,6 @@ def test_prompt_limit_exceed():
    assert out.ignored_seq_groups[0] == seq_group
-def test_swap():
-    """Verify swapping works with chunked prefill requests"""
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 16
-    cache_config.num_gpu_blocks = 16
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       best_of=2,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-    # The last request should be swapped out.
-    scheduler.block_manager.can_append_slots = MagicMock()
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "1"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
-    # The running prefill is now swapped.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 0
-    assert out.num_batched_tokens == 0
-    assert out.blocks_to_swap_out != []
-    assert out.blocks_to_swap_in == []
-    # Add 1 more task. Swap should be prioritized over new prefill.
-    _, seq_group = create_dummy_prompt("2", prompt_length=60)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    # 3 decodes. It is swapped in.
-    assert out.num_batched_tokens == 30
-    assert out.blocks_to_swap_in != []
-    assert out.blocks_to_swap_out == []
-def test_running_prefill_prioritized_over_swap():
-    block_size = 4
-    max_seqs = 30
-    max_model_len = 200
-    max_num_batched_tokens = 30
-    scheduler_config = SchedulerConfig(
-        "generate",
-        max_num_batched_tokens,
-        max_seqs,
-        max_model_len,
-        enable_chunked_prefill=True,
-    )
-    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
-    cache_config.num_cpu_blocks = 32
-    cache_config.num_gpu_blocks = 32
-    scheduler = Scheduler(scheduler_config, cache_config, None)
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=60,
-                                       best_of=2,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    # The request is chunked.
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_prefill_groups == 1
-    assert seq_group.is_prefill()
-    assert out.num_batched_tokens == max_num_batched_tokens
-    # The request should be swapped out.
-    scheduler.block_manager.can_append_slots = MagicMock()
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "1"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
-    # The running prefill is now swapped.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 0
-    assert out.num_batched_tokens == 0
-    assert out.blocks_to_swap_out != []
-    assert out.blocks_to_swap_in == []
-    # Add 1 more task. Swap is not possible, so prefill is running.
-    scheduler.block_manager.can_swap_in = MagicMock()
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
-    _, seq_group2 = create_dummy_prompt("2",
-                                        prompt_length=60,
-                                        block_size=block_size)
-    scheduler.add_seq_group(seq_group2)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    # 3 decodes. It is swapped in.
-    assert out.num_batched_tokens == 30
-    assert out.blocks_to_swap_in == []
-    assert out.blocks_to_swap_out == []
-    assert out.scheduled_seq_groups[0].seq_group == seq_group2
-    # Now although swap is possible, running prefill is prioritized.
-    scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    # 3 decodes. It is swapped in.
-    assert out.num_batched_tokens == 30
-    assert out.blocks_to_swap_in == []
-    assert out.blocks_to_swap_out == []
-    assert not seq_group2.is_prefill()
-    assert out.scheduled_seq_groups[0].seq_group == seq_group2
-    append_new_token(seq_group2, 1)
-    # Decoding is prioritized.
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    # 3 decodes. It is swapped in.
-    assert out.num_batched_tokens == 1
-    assert out.blocks_to_swap_in == []
-    assert out.blocks_to_swap_out == []
-    assert not seq_group2.is_prefill()
-    assert out.scheduled_seq_groups[0].seq_group == seq_group2
-    append_new_token(seq_group2, 1)
-    # Since we abort the sequence group, we can finally swap.
-    scheduler.abort_seq_group(seq_group2.request_id)
-    _, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 1
-    assert out.num_batched_tokens == 30
-    assert out.blocks_to_swap_in != []
-    assert out.blocks_to_swap_out == []
 def test_chunked_prefill_preempt():
    """Verify preempt works with chunked prefill requests"""
    block_size = 4

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -10,7 +10,7 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SequenceGroup, SequenceStatus
+from vllm.sequence import SequenceGroup
 from .utils import (append_new_token, append_new_token_seq_group,
                    create_dummy_prompt, get_sequence_groups,
@@ -296,55 +296,6 @@ def test_scheduler_delay_factor():
    append_new_token(out, 1)
-def test_swapped_out_prioritized():
-    block_size = 4
-    scheduler = initialize_scheduler(max_num_seqs=6,
-                                     block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    # best_of=2 * 3 == 6 sequences.
-    for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           best_of=2,
-                                           block_size=block_size)
-        scheduler.add_seq_group(seq_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    # prefill scheduled now.
-    assert len(out.scheduled_seq_groups) == 3
-    append_new_token(out, 1)
-    # The last request should be swapped out.
-    scheduler.block_manager.can_append_slots = MagicMock()
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "2"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    assert len(out.scheduled_seq_groups) == 2
-    assert out.num_batched_tokens == 2
-    assert out.blocks_to_swap_out != []
-    assert out.blocks_to_swap_in == []
-    append_new_token(out, 1)
-    # Add 1 more task. Swap should be prioritized over prefill.
-    _, seq_group = create_dummy_prompt(str(i),
-                                       prompt_length=60,
-                                       best_of=2,
-                                       block_size=block_size)
-    scheduler.add_seq_group(seq_group)
-    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
-    append_new_token(out, 1)
-    assert len(out.scheduled_seq_groups) == 3
-    # 3 decodes. It is swapped in.
-    assert out.num_batched_tokens == 3
-    assert out.blocks_to_swap_in != []
-    assert out.blocks_to_swap_out == []
 def initialize_scheduler(
    *,
    max_num_seqs=1000,
@@ -646,60 +597,6 @@ def test_decode_schedule_preempted():
    assert output.blocks_to_copy == []
-def test_decode_swap_beam_search():
-    """
-    Test best_of > 1 swap out blocks
-    """
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_gpu_blocks=64,
-                                     num_cpu_blocks=64)
-    curr_loras = None
-    budget = create_token_budget()
-    for i in range(3):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           best_of=2,
-                                           block_size=block_size)
-        scheduler._allocate_and_set_running(seq_group)
-        scheduler._add_seq_group_to_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        budget.add_num_seqs(seq_group.request_id,
-                            seq_group.get_max_num_running_seqs())
-        budget.add_num_batched_tokens(
-            seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING))
-    # The last request should be swapped out.
-    scheduler.block_manager.can_append_slots = MagicMock()
-    def cannot_append_second_group(seq_group, num_lookahead_slots):
-        return seq_group.request_id != "2"
-    scheduler.block_manager.can_append_slots.side_effect = (
-        cannot_append_second_group)
-    scheduler.block_manager.swap_out = MagicMock()
-    expected_swap_mapping = [("5", "7")]
-    scheduler.block_manager.swap_out.return_value = expected_swap_mapping
-    output = scheduler._schedule_running(budget, curr_loras)
-    remainig_running = scheduler.running
-    assert len(remainig_running) == 0
-    assert len(output.decode_seq_groups) == 2
-    assert len(output.prefill_seq_groups) == 0
-    assert output.decode_seq_groups[0].seq_group.request_id == "0"
-    assert output.decode_seq_groups[1].seq_group.request_id == "1"
-    assert len(output.preempted) == 0
-    assert len(output.swapped_out) == 1
-    # Budget should refledct preempted requests.
-    assert budget.num_batched_tokens == 2
-    # since there are 2 sequences, 2 should be subtracted.
-    assert budget.num_curr_seqs == 4
-    # Both should be preempted, not swapped.
-    assert output.blocks_to_swap_out == expected_swap_mapping
-    # Nothing is copied.
-    assert output.blocks_to_copy == []
 def test_schedule_decode_blocks_to_copy_update():
    """
    Verify blocks_to_copy is updated.
@@ -736,105 +633,6 @@ def test_schedule_decode_blocks_to_copy_update():
    assert output.blocks_to_copy == [(2, 3)]
-def test_schedule_swapped_simple():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size)
-    curr_loras = None
-    blocks_to_swap_out: List[Tuple[int, int]] = []
-    _, seq_group = create_dummy_prompt("1",
-                                       prompt_length=4,
-                                       best_of=2,
-                                       block_size=block_size)
-    scheduler._allocate_and_set_running(seq_group)
-    append_new_token_seq_group(4, seq_group, 1)
-    scheduler._swap_out(seq_group, blocks_to_swap_out)
-    scheduler._add_seq_group_to_swapped(seq_group)
-    budget = create_token_budget()
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 0
-    assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 2
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    # swap in is the reverse of swap out
-    blocks_to_swap_in_reverse = []
-    for swapin, swapout in output.blocks_to_swap_in:
-        blocks_to_swap_in_reverse.append((swapout, swapin))
-    assert blocks_to_swap_out == blocks_to_swap_in_reverse
-def test_schedule_swapped_max_token_budget():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=32,
-                                     num_gpu_blocks=32)
-    curr_loras = None
-    blocks_to_swap_out: List[Tuple[int, int]] = []
-    for i in range(2):
-        _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-    budget = create_token_budget(token_budget=1)
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 1
-    assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 2
-    assert len(output.decode_seq_groups) == 1
-    assert len(output.prefill_seq_groups) == 0
-    # Verify num_batched_tokens are respected.
-    budget = create_token_budget(token_budget=1)
-    add_token_budget(budget, 1, 0)
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 1
-    assert budget.num_batched_tokens == 1
-    assert budget.num_curr_seqs == 0
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
-def test_schedule_swapped_max_seqs():
-    block_size = 4
-    scheduler = initialize_scheduler(block_size=block_size,
-                                     num_cpu_blocks=64,
-                                     num_gpu_blocks=64)
-    curr_loras = None
-    blocks_to_swap_out: List[Tuple[int, int]] = []
-    for i in range(4):
-        _, seq_group = create_dummy_prompt(str(i),
-                                           prompt_length=60,
-                                           block_size=4)
-        scheduler._allocate_and_set_running(seq_group)
-        append_new_token_seq_group(60, seq_group, 1)
-        scheduler._swap_out(seq_group, blocks_to_swap_out)
-        scheduler._add_seq_group_to_swapped(seq_group)
-    budget = create_token_budget(max_num_seqs=2)
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 2
-    assert budget.num_batched_tokens == 2
-    assert budget.num_curr_seqs == 2
-    assert len(output.decode_seq_groups) == 2
-    assert len(output.prefill_seq_groups) == 0
-    # Verify num_curr_seqs are respected.
-    output = scheduler._schedule_swapped(budget, curr_loras)
-    remaining_swapped = scheduler.swapped
-    assert len(remaining_swapped) == 2
-    assert budget.num_batched_tokens == 2
-    assert budget.num_curr_seqs == 2
-    assert len(output.decode_seq_groups) == 0
-    assert len(output.prefill_seq_groups) == 0
 def test_schedule_swapped_max_loras():
    block_size = 4
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -118,11 +118,8 @@ class PPTestSettings:
 # The values displayed here are only a rough indicator of the size of the model
 # yapf: disable
-GENERATION_MODEL_SETTINGS = {
+TEXT_GENERATION_MODELS = {
-    # [DETAILED TESTS]
+    # [Decoder-only]
-    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
-    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True),  # noqa: E501
-    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
@@ -151,6 +148,7 @@ GENERATION_MODEL_SETTINGS = {
    "core42/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
+    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
@@ -163,6 +161,7 @@ GENERATION_MODEL_SETTINGS = {
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
+    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True),  # noqa: E501
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
@@ -174,39 +173,40 @@ GENERATION_MODEL_SETTINGS = {
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
+    # [Encoder-decoder]
+    # TODO: Implement PP
+    # "facebook/bart-base": PPTestSettings.fast(),
 }
-EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
+EMBEDDING_MODELS = {  # type: ignore[var-annotated]
-    # [FAST TESTS]
+    # [Text-only]
    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
 }
-MULTIMODAL_MODEL_SETTINGS = {
+MULTIMODAL_MODELS = {
-    # [FAST TESTS]
+    # [Decoder-only]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
+    "THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
-    # TODO: Implement PP
+    "allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
-    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
-}
+    # [Encoder-decoder]
-CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
-    # [FAST TESTS]
    # TODO: Implement PP
-    # "facebook/bart-base": PPTestSettings.fast(),
+    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
 }
 # yapf: enable
@@ -322,7 +322,7 @@ def _compare_tp(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
-        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
+        params for model_name, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
@@ -349,7 +349,7 @@ def test_tp_language_generation(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
-        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
+        params for model_name, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],
@@ -376,7 +376,7 @@ def test_tp_language_embedding(
    ("model_name", "parallel_setup", "distributed_backend", "task",
     "test_options"),
    [
-        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
+        params for model_name, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_name)
        if model_name in TEST_MODELS
    ],

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -26,6 +26,7 @@ class MockModelConfig:
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
+    chat_template_text_format = "string"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -8,14 +8,16 @@ from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (parse_chat_messages,
                                         parse_chat_messages_futures)
+from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def phi3v_model_config():
    return ModelConfig(PHI3V_MODEL_ID,
                       task="generate",
@@ -24,6 +26,7 @@ def phi3v_model_config():
                       trust_remote_code=True,
                       dtype="bfloat16",
                       seed=0,
+                       chat_template_text_format="string",
                       limit_mm_per_prompt={
                           "image": 2,
                       })
@@ -39,6 +42,30 @@ def phi3v_tokenizer():
    )
+@pytest.fixture(scope="module")
+def mllama_model_config():
+    """Module-scoped ``ModelConfig`` for the Mllama checkpoint.
+
+    The config allows at most two images per prompt.
+    """
+    mm_limits = {"image": 2}
+    return ModelConfig(
+        MLLAMA_MODEL_ID,
+        task="generate",
+        tokenizer=MLLAMA_MODEL_ID,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        dtype="bfloat16",
+        seed=0,
+        limit_mm_per_prompt=mm_limits,
+    )
+@pytest.fixture(scope="module")
+def mllama_tokenizer():
+    """Module-scoped ``TokenizerGroup`` for the Mllama checkpoint, LoRA off."""
+    group_options = dict(
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+    return TokenizerGroup(MLLAMA_MODEL_ID, **group_options)
 @pytest.fixture(scope="module")
 def image_url():
    image = ImageAsset('cherry_blossom')
@@ -304,6 +331,51 @@ def test_parse_chat_messages_multiple_images_across_messages(
    _assert_mm_data_is_image_input(mm_data, 2)
+def test_parse_chat_messages_context_text_format(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    """Check that the "openai" text format yields list-of-parts content.
+
+    Messages whose content is a plain string and messages whose content is
+    already a list of text parts must both come back as a list of
+    ``{"type": "text", "text": ...}`` dictionaries.
+    """
+    # This mutates the config object handed in by the fixture, so the
+    # fixture must not be shared with tests that expect the "string" format.
+    phi3v_model_config.chat_template_text_format = "openai"
+    conversation, mm_data = parse_chat_messages(
+        [{
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What's in this text?"
+            }]
+        }, {
+            "role": "assistant",
+            "content": "Some stuff."
+        }, {
+            "role": "user",
+            "content": "What about this one?"
+        }], phi3v_model_config, phi3v_tokenizer)
+    # Every message, including the two given as plain strings, is expected
+    # in the list-of-text-parts form.
+    assert conversation == [
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What's in this text?"
+            }]
+        },
+        {
+            "role": "assistant",
+            "content": [{
+                "type": "text",
+                "text": "Some stuff."
+            }]
+        },
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What about this one?"
+            }]
+        },
+    ]
 def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
@@ -414,3 +486,153 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]
    _assert_mm_data_is_image_input(mm_data, 2)
+# Mllama currently wraps images / texts as interleaved dictionaries
+def test_mllama_single_image(
+    mllama_model_config,
+    mllama_tokenizer,
+    image_url,
+):
+    """Ensures that a single image is parsed correctly for mllama."""
+    # The image part is passed without an explicit "type" key.
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            'type': 'text',
+            'text': 'The content of this image is:'
+        }, {
+            "image_url": image_url
+        }]
+    }], mllama_model_config, mllama_tokenizer)
+    _assert_mm_data_is_image_input(mm_data, 1)
+    # The image is expected back as a bare {'type': 'image'} part that keeps
+    # its position after the text part.
+    assert conversation == [{
+        'role':
+        'user',
+        'content': [{
+            'type': 'text',
+            'text': 'The content of this image is:'
+        }, {
+            'type': 'image'
+        }]
+    }]
+def test_mllama_interleaved_images(
+    mllama_model_config,
+    mllama_tokenizer,
+    image_url,
+):
+    """Ensures that multiple images are parsed as interleaved dicts."""
+    # Text and image parts alternate; the same image URL is used twice.
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [
+            {
+                'type': 'text',
+                'text': 'The content of the first image is:'
+            },
+            {
+                "image_url": image_url
+            },
+            {
+                'type': 'text',
+                'text': 'The content of the second image is:'
+            },
+            {
+                "image_url": image_url
+            },
+        ]
+    }], mllama_model_config, mllama_tokenizer)
+    _assert_mm_data_is_image_input(mm_data, 2)
+    # The parsed content must keep the original text/image ordering, with
+    # each image reduced to a bare {'type': 'image'} part.
+    assert conversation == [{
+        'role':
+        'user',
+        'content': [{
+            'type': 'text',
+            'text': 'The content of the first image is:'
+        }, {
+            'type': 'image'
+        }, {
+            'type': 'text',
+            'text': 'The content of the second image is:'
+        }, {
+            'type': 'image'
+        }]
+    }]
+@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID])
+def test_multimodal_image_parsing_matches_hf(model, image_url):
+    """Checks end to end hf alignment for multimodal [image] parsing.
+
+    The same conversation is rendered twice: once by handing HF-style
+    ``{'type': 'image'}`` parts directly to the tokenizer's chat template,
+    and once by sending OpenAI-style ``image_url`` parts through
+    ``parse_chat_messages`` followed by ``apply_hf_chat_template``. The two
+    rendered prompts must be identical.
+    """
+    def get_conversation(is_hf: bool):
+        # HF templates take a bare image marker; vLLM takes an image URL part.
+        if is_hf:
+            img_part = {'type': 'image'}
+        else:
+            img_part = {"type": "image_url", "image_url": {"url": image_url}}
+        return [{
+            'role':
+            'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': 'The content of the first image is:'
+                },
+                img_part,
+                {
+                    'type': 'text',
+                    'text': 'The content of the second image is:'
+                },
+                img_part,
+                {
+                    'type': 'text',
+                    'text': 'What animal is in the first image?'
+                },
+            ]
+        }]
+    # Build a config for the parametrized model. The tokenizer comes from the
+    # same checkpoint, so adding a model to the parametrization compares
+    # against that model's own chat template rather than always Mllama's.
+    model_config = ModelConfig(model,
+                               task="generate",
+                               tokenizer=model,
+                               tokenizer_mode="auto",
+                               trust_remote_code=True,
+                               dtype="bfloat16",
+                               seed=0,
+                               limit_mm_per_prompt={
+                                   "image": 2,
+                               })
+    # Build the tokenizer group and grab the underlying tokenizer
+    tokenizer_group = TokenizerGroup(
+        model,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+    tokenizer = tokenizer_group.tokenizer
+    # Build and parse a conversation with {"type": "image"} using the tokenizer
+    hf_conversation = get_conversation(is_hf=True)
+    hf_result = tokenizer.apply_chat_template(
+        hf_conversation,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    # Now parse with vLLM's chat utils & apply the template
+    vllm_conversation = get_conversation(is_hf=False)
+    conversation, _ = parse_chat_messages(
+        vllm_conversation,
+        model_config,
+        tokenizer_group,
+    )
+    vllm_result = apply_hf_chat_template(
+        tokenizer,
+        conversation=conversation,
+        chat_template=None,
+        add_generation_prompt=True,
+    )
+    assert hf_result == vllm_result
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+from typing import Any, Dict, Tuple
+import pytest
+import torch
+from PIL.Image import Image
+from transformers import AutoTokenizer
+from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal import MultiModalRegistry
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+MODEL = "Qwen/Qwen2-VL-2B-Instruct"
+MIN_PIXELS = "min_pixels"
+MAX_PIXELS = "max_pixels"
+# Fixtures use lazy imports to avoid initializing CUDA during test collection
+# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple
+# input mappers.
+@pytest.fixture()
+def image_input_mapper_for_qwen2_vl():
+    """Return the Qwen2-VL image input mapper, imported on first use."""
+    from vllm.model_executor.models import qwen2_vl
+    return qwen2_vl.image_input_mapper_for_qwen2_vl
+@pytest.fixture()
+def input_processor_for_qwen2_vl():
+    """Return the Qwen2-VL input processor, imported on first use."""
+    from vllm.model_executor.models import qwen2_vl
+    return qwen2_vl.input_processor_for_qwen2_vl
+@pytest.fixture()
+def qwen2_vl_context() -> InputContext:
+    """Provide an ``InputContext`` built for the ``MODEL`` checkpoint."""
+    context = build_model_context(model_name=MODEL)
+    return context
+@pytest.fixture()
+def get_max_qwen2_vl_image_tokens():
+    """Return the Qwen2-VL max-image-token helper, imported on first use."""
+    from vllm.model_executor.models import qwen2_vl
+    return qwen2_vl.get_max_qwen2_vl_image_tokens
+@pytest.fixture()
+def dummy_data_for_qwen2_vl():
+    """Return the Qwen2-VL dummy-data factory, imported on first use."""
+    from vllm.model_executor.models import qwen2_vl
+    return qwen2_vl.dummy_data_for_qwen2_vl
+@pytest.mark.parametrize(
+    "mm_processor_kwargs,expected_max_tokens",
+    [
+        ({}, 1225),
+        ({MIN_PIXELS: 64**2, MAX_PIXELS: 512**2}, 324),
+    ],
+)
+def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
+                                   qwen2_vl_context: InputContext,
+                                   mm_processor_kwargs: Dict[str, Any],
+                                   expected_max_tokens: int):
+    """The max image token count must follow the min/max pixel overrides."""
+    max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
+                                               **mm_processor_kwargs)
+    assert max_tokens == expected_max_tokens
+@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
+    ({}, 1225, (980, 980)),
+    ({
+        MIN_PIXELS: 64**2,
+        MAX_PIXELS: 512**2
+    }, 324, (504, 504)),
+])
+def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
+                             qwen2_vl_context: InputContext,
+                             mm_processor_kwargs: Dict[str, Any],
+                             token_count: int, img_size: Tuple[int, int]):
+    """Dummy data must honour the min/max pixel overrides.
+
+    Checks both the number of image placeholder tokens in the dummy
+    sequence and the size of the dummy image.
+    """
+    seq_len = 3000
+    image_token_id = qwen2_vl_context.get_hf_config().image_token_id
+    # NOTE: video value is required, but isn't actually used
+    # when making the dummy data except for error handling currently
+    mm_counts = {"image": 1, "video": 0}
+    seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len,
+                                                mm_counts,
+                                                **mm_processor_kwargs)
+    # The placeholder count depends on the min/max pixel values
+    placeholder_count = seq_data.get_token_ids().count(image_token_id)
+    assert placeholder_count == token_count
+    # The dummy image must have been resized to the expected dimensions
+    dummy_image = mm_data["image"]
+    assert isinstance(dummy_image, Image)
+    assert dummy_image.size == img_size
+@pytest.mark.parametrize(
+    "mm_processor_kwargs,num_placeholders",
+    [
+        ({}, 1426),
+        ({MIN_PIXELS: 64**2, MAX_PIXELS: 512**2}, 330),
+    ],
+)
+def test_input_processor(input_processor_for_qwen2_vl,
+                         qwen2_vl_context: InputContext,
+                         image_assets: _ImageAssets, num_placeholders: int,
+                         mm_processor_kwargs: Dict[str, Any]):
+    """The input processor must honour the min/max pixel overrides.
+
+    A prompt with one image placeholder is processed, and the number of
+    image tokens in the result is compared with the expected count.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(MODEL)
+    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    image_token_id = qwen2_vl_context.get_hf_config().image_token_id
+    inputs = token_inputs(
+        prompt_token_ids=tokenizer.encode(prompt),
+        prompt=prompt,
+        multi_modal_data={"image": [image_assets[0].pil_image]},
+    )
+    processed = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
+                                             **mm_processor_kwargs)
+    placeholder_count = processed["prompt_token_ids"].count(image_token_id)
+    assert placeholder_count == num_placeholders
+    assert len(processed["multi_modal_data"]["image"]) == 1
+@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
+    ({}, (5704, 1176)),
+    ({
+        MIN_PIXELS: 64**2,
+        MAX_PIXELS: 512**2
+    }, (1320, 1176)),
+])
+def test_image_mapper_override(qwen2_vl_context: InputContext,
+                               image_assets: _ImageAssets,
+                               mm_processor_kwargs: Dict[str, Any],
+                               pixels_shape: Tuple[int, int]):
+    """Ensure that the image mapper handles min/max pixels properly.
+
+    Maps one image through a fresh ``MultiModalRegistry`` and checks the
+    shape of the resulting ``pixel_values`` against ``pixels_shape``.
+    """
+    mm_registry = MultiModalRegistry()
+    mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
+    image = image_assets[0].pil_image
+    mapped_output = mm_registry.map_input(
+        qwen2_vl_context.model_config,
+        {"image": image},
+        mm_processor_kwargs=mm_processor_kwargs,
+    )
+    # Compare as tuples so the parametrized values match the declared
+    # Tuple[int, int] annotation.
+    actual_pixels_shape = mapped_output["pixel_values"].shape
+    assert tuple(actual_pixels_shape) == pixels_shape
+    # Dimension 0 of pixel values should match the product of image_grid_thw
+    assert actual_pixels_shape[0] == torch.prod(
+        mapped_output["image_grid_thw"])
--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
+from functools import partial
+from typing import List, Optional, Tuple, Type
+import pytest
+from PIL import Image
+from vllm.inputs.data import ExplicitEncoderDecoderPrompt
+from vllm.sequence import SampleLogprobs
+from ....conftest import HfRunner, VllmRunner
+from ...utils import check_logprobs_close
+# Encoder-only prompt factory: the decoder prompt and the multimodal
+# processor kwargs are always left unset.
+Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
+                          decoder_prompt=None,
+                          mm_processor_kwargs=None)
+MODELS = ["microsoft/Florence-2-base"]
+# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
+# Therefore, we borrow the BartTokenizer from the original Bart model
+TOKENIZER = "facebook/bart-base"
+# One encoder prompt per task token exercised by the test.
+PROMPTS = [
+    Florence2Prompt(encoder_prompt=task_token) for task_token in (
+        "<CAPTION>",
+        "<DETAILED_CAPTION>",
+        "<MORE_DETAILED_CAPTION>",
+        "<CAPTION_TO_PHRASE_GROUNDING>",
+        "<DENSE_REGION_CAPTION>",
+        "<REGION_PROPOSAL>",
+        "<OCR_WITH_REGION>",
+        "<OCR>",
+        "<OD>",
+    )
+]
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]], ):
+    """Wrap the vLLM output text so it is comparable with the HF output.
+
+    The token ids and logprobs are passed through unchanged; only the text
+    is wrapped as ``</s><s>`` + text + ``</s>``.
+    """
+    token_ids, text, logprobs = vllm_output
+    wrapped_text = "".join(("</s><s>", text, "</s>"))
+    return token_ids, wrapped_text, logprobs
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    prompts: List[ExplicitEncoderDecoderPrompt],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+) -> None:
+    """Compare greedy encoder-decoder logprobs between vLLM and HF.
+
+    The prompts are generated with the vLLM runner first, then with the HF
+    runner, and the two result sets are passed to ``check_logprobs_close``.
+
+    Args:
+        hf_runner: Factory for the HF reference runner.
+        vllm_runner: Factory for the vLLM runner.
+        prompts: Encoder/decoder prompts to generate from.
+        model: Model name passed to both runners.
+        dtype: Dtype string passed to both runners.
+        max_tokens: Maximum number of tokens to generate per prompt.
+        num_logprobs: Number of logprobs requested per generated token.
+        tensor_parallel_size: Tensor parallel size for the vLLM runner.
+        distributed_executor_backend: Optional executor backend for the
+            vLLM runner.
+    """
+    # The vLLM context is exited before the HF model is created.
+    with vllm_runner(model,
+                     tokenizer_name=TOKENIZER,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+            prompts, max_tokens, num_logprobs)
+    # Florence-2 processors require image inputs
+    dummy_image = Image.new(mode="RGB", size=(2, 2))
+    with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
+        # Point get_output_embeddings at the language model's lm_head.
+        # NOTE(review): presumably the HF runner needs this to compute
+        # logprobs for this model - confirm against HfRunner.
+        hf_model.model.get_output_embeddings = lambda: \
+            hf_model.model.language_model.lm_head
+        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+            prompts,
+            max_tokens,
+            num_logprobs,
+            images=[dummy_image] * len(prompts),
+        ))
+    # vLLM outputs are wrapped by vllm_to_hf_output before the comparison.
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=[
+            vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
+        ],
+        name_0="hf",
+        name_1="vllm",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
+                num_logprobs) -> None:
+    """Run the vLLM-vs-HF comparison on a single device for each model."""
+    run_options = dict(
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
+    run_test(hf_runner, vllm_runner, PROMPTS, model, **run_options)
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -10,7 +10,7 @@ import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu
+from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino
 logger = init_logger(__name__)
@@ -136,7 +136,7 @@ def get_attn_backend(
        from vllm.attention.backends.openvino import OpenVINOAttentionBackend
        return OpenVINOAttentionBackend
    elif backend == _Backend.IPEX:
-        assert is_xpu(), RuntimeError(
+        assert current_platform.is_xpu(), RuntimeError(
            "IPEX attention backend is only used for the XPU device.")
        logger.info("Using IPEX attention backend.")
        from vllm.attention.backends.ipex_attn import IpexAttnBackend
@@ -198,7 +198,7 @@ def which_attn_to_use(
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        return _Backend.OPENVINO
-    if is_xpu():
+    if current_platform.is_xpu():
        if selected_backend != _Backend.IPEX:
            logger.info("Cannot use %s backend on XPU.", selected_backend)
        return _Backend.IPEX

--- a/vllm/benchmarks/benchmark_throughput.py
+++ b/vllm/benchmarks/benchmark_throughput.py
@@ -322,7 +322,6 @@ def main(args: argparse.Namespace):
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
    # Output JSON results if specified
    if args.output_json:
        results = {

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -17,7 +17,7 @@ from vllm.transformers_utils.config import (ConfigFormat, get_config,
                                            get_hf_image_processor_config,
                                            get_hf_text_config)
 from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory,
-                        is_hip, is_openvino, is_xpu, print_warning_once)
+                        is_hip, is_openvino, print_warning_once)
 if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
@@ -142,6 +142,7 @@ class ModelConfig:
                 use_async_output_proc: bool = True,
                 override_neuron_config: Optional[Dict[str, Any]] = None,
                 config_format: ConfigFormat = ConfigFormat.AUTO,
+                 chat_template_text_format: str = "string",
                 mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> None:
        self.model = model
        self.tokenizer = tokenizer
@@ -176,6 +177,7 @@ class ModelConfig:
            self.model, revision)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
        self.use_async_output_proc = use_async_output_proc
+        self.chat_template_text_format = chat_template_text_format
        self.mm_processor_kwargs = mm_processor_kwargs
        # Set enforce_eager to False if the value is unset.
@@ -1130,7 +1132,7 @@ class DeviceConfig:
                self.device_type = "tpu"
            elif current_platform.is_cpu():
                self.device_type = "cpu"
-            elif is_xpu():
+            elif current_platform.is_xpu():
                self.device_type = "xpu"
            else:
                raise RuntimeError("Failed to infer device type")

--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -290,7 +290,7 @@ def scheduler_running_outputs_builder():
 def scheduled_seq_group_builder():
-    return ScheduledSequenceGroup(SequenceGroup("", [], -1),
+    return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
                                  token_chunk_size=0)
    # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -89,6 +89,7 @@ class EngineArgs:
    task: TaskOption = "auto"
    skip_tokenizer_init: bool = False
    tokenizer_mode: str = 'auto'
+    chat_template_text_format: str = 'string'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
@@ -250,6 +251,14 @@ class EngineArgs:
            'fast tokenizer if available.\n* "slow" will '
            'always use the slow tokenizer. \n* '
            '"mistral" will always use the `mistral_common` tokenizer.')
+        parser.add_argument(
+            '--chat-template-text-format',
+            type=str,
+            default=EngineArgs.chat_template_text_format,
+            choices=['string', 'openai'],
+            help='The format to render text content within a chat template. '
+            '"string" will keep the content field as a string whereas '
+            '"openai" will parse content in the current OpenAI format.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='Trust remote code from huggingface.')
@@ -858,6 +867,7 @@ class EngineArgs:
            # We know this is not None because we set it in __post_init__
            tokenizer=cast(str, self.tokenizer),
            tokenizer_mode=self.tokenizer_mode,
+            chat_template_text_format=self.chat_template_text_format,
            trust_remote_code=self.trust_remote_code,
            dtype=self.dtype,
            seed=self.seed,