[Misc] Fix input processing for Ultravox (#13871)

7ca1da02 · Roger Wang · GitHub · 5157338e · 7ca1da02 · 7ca1da02
Unverified Commit 7ca1da02 authored Feb 25, 2025 by Roger Wang Committed by GitHub Feb 25, 2025
3 changed files
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -83,8 +83,8 @@ def _test_processing_correctness(
    }

    tokenizer_encode_kwargs = {}
-    if model_config.hf_config.model_type in ("mllama", "whisper"):
-        # For some encoder-decoder models, tokenizer will always add bos_token
+    if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
+        # For some multimodal models, tokenizer will always add bos_token
        # at the beginning of prompt by default, causing hf_processor outputs
        # incorrect token ids. So we need use `add_special_tokens=False` here
        # to leave bos_token to be added by the processor.
@@ -172,7 +172,7 @@ def _test_processing_correctness(
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
-    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "fixie-ai/ultravox-v0_4",
    "openai/whisper-large-v3",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -284,7 +284,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
                                                          min_transformers_version="4.49"),  # noqa: E501
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
+    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_4",
                                     trust_remote_code=True),
    # [Encoder-decoder]
    "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501

--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -146,7 +146,8 @@ class UltravoxMultiModalProcessor(
    ) -> BatchFeature:
        # Text-only input not supported in composite processor
        if not mm_data or not mm_data.get("audios", []):
-            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self.info.get_tokenizer().encode(
+                prompt, add_special_tokens=False)
            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

@@ -185,16 +186,6 @@ class UltravoxMultiModalProcessor(
        )
        return BatchFeature(combined_outputs)

-    def _apply_hf_processor_tokens_only(
-        self,
-        prompt_tokens: list[int],
-    ) -> list[int]:
-        # HF processor omits bos_token_id by setting add_special_tokens=False
-        tokenizer = self.info.get_tokenizer()
-        assert prompt_tokens[0] == tokenizer.bos_token_id
-
-        return prompt_tokens[1:]
-
    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,