[Model] Add user-configurable task for models that support both generation and embedding (#9424)

051eaf6d · Cyrus Leung · GitHub · 7dbe738d · 051eaf6d · 051eaf6d
Unverified commit 051eaf6d · authored Oct 19, 2024 by Cyrus Leung · committed by GitHub Oct 18, 2024
20 changed files
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -294,6 +294,10 @@ Text Embedding
    - 
    - ✅︎
+.. important::
+  Some model architectures support both generation and embedding tasks.
+  In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
 Reward Modeling
 ---------------
@@ -482,6 +486,10 @@ Multimodal Embedding
    - 🚧
    - ✅︎
+.. important::
+  Some model architectures support both generation and embedding tasks.
+  In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
 ----
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.

--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
 .. code-block:: bash
-    vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
+    vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
-      --trust-remote-code --limit-mm-per-prompt image=2
+      --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 .. important::
    Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,

--- a/examples/offline_inference_vision_language_embedding.py
+++ b/examples/offline_inference_vision_language_embedding.py
@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha
 # Create an LLM.
 llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
+    task="embedding",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,

--- a/examples/openai_api_client_for_multimodal.py
+++ b/examples/openai_api_client_for_multimodal.py
@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
 vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 (multi-image inference with Phi-3.5-vision-instruct)
-vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
+vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
-    --trust-remote-code --limit-mm-per-prompt image=2
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 (audio inference with Ultravox)
 vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
                              destroy_model_parallel,
@@ -619,6 +619,7 @@ class VllmRunner:
    def __init__(
        self,
        model_name: str,
+        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
@@ -634,6 +635,7 @@ class VllmRunner:
    ) -> None:
        self.model = LLM(
            model=model_name,
+            task=task,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -33,7 +33,8 @@ def test_simple():
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                       num_seq_group,
                                       max_model_len,
                                       enable_chunked_prefill=True)
@@ -78,6 +79,7 @@ def test_chunk():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -126,6 +128,7 @@ def test_complex():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -196,6 +199,7 @@ def test_maximal_decoding():
    max_model_len = 8
    max_num_batched_tokens = 2
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -289,6 +293,7 @@ def test_prompt_limit():
    max_model_len = 64
    max_num_batched_tokens = 32
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
    max_seqs = 64
    max_model_len = 32
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                       max_seqs,
                                       max_model_len,
                                       enable_chunked_prefill=True)
@@ -348,6 +354,7 @@ def test_swap():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -617,6 +627,7 @@ def test_perfix_caching():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
 def test_scheduler_add_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        1,
+        max_num_seqs=64,
+        max_model_len=1,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    cache_config.num_cpu_blocks = 4
@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
 def test_scheduler_abort_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        1,
+        max_num_seqs=64,
+        max_model_len=1,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 4
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        num_seq_group,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
    max_model_len = 30
    max_batched_num_tokens = 30
    scheduler_config = SchedulerConfig(
-        max_batched_num_tokens,
+        "generate",
-        2,
+        max_num_batched_tokens=max_batched_num_tokens,
-        max_model_len,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
    block_size = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        2,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
    max_seq_group = 2
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        max_seq_group,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=max_seq_group,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
 def test_scheduler_delay_factor():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        16,
+        max_num_seqs=64,
+        max_model_len=16,
        delay_factor=0.5,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
@@ -350,9 +357,10 @@ def initialize_scheduler(
 ):
    block_size = block_size
    scheduler_config = SchedulerConfig(
-        max_token_budget,
+        "generate",
-        max_num_seqs,
+        max_num_batched_tokens=max_token_budget,
-        max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = num_cpu_blocks

--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        task="generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
 import pytest
+from vllm.config import TaskOption
 from vllm.logger import init_logger
 from ..utils import compare_two_settings, fork_new_process_for_each_test
@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
 class PPTestSettings:
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
+    task: TaskOption
    trust_remote_code: bool
    tokenizer_mode: Optional[str]
@@ -39,6 +41,7 @@ class PPTestSettings:
        *,
        tp_base: int = 1,
        pp_base: int = 2,
+        task: TaskOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
@@ -66,6 +69,7 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
+            task=task,
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )
@@ -75,6 +79,7 @@ class PPTestSettings:
        *,
        tp_base: int = 1,
        pp_base: int = 2,
+        task: TaskOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
@@ -86,6 +91,7 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
+            task=task,
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )
@@ -94,7 +100,7 @@ class PPTestSettings:
        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
-                       self.trust_remote_code, self.tokenizer_mode)
+                       self.task, self.trust_remote_code, self.tokenizer_mode)
 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
@@ -213,6 +219,7 @@ def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    task: TaskOption,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available: int,
@@ -240,6 +247,8 @@ def _compare_tp(
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
+    if task != "auto":
+        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
@@ -297,7 +306,7 @@ def _compare_tp(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
@@ -310,6 +319,7 @@ def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    task: TaskOption,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
@@ -317,6 +327,7 @@ def test_tp_language_generation(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                task,
                trust_remote_code,
                tokenizer_mode,
                num_gpus_available,
@@ -324,7 +335,7 @@ def test_tp_language_generation(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
@@ -337,6 +348,7 @@ def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    task: TaskOption,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
@@ -344,6 +356,7 @@ def test_tp_language_embedding(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                task,
                trust_remote_code,
                tokenizer_mode,
                num_gpus_available,
@@ -351,7 +364,7 @@ def test_tp_language_embedding(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    task: TaskOption,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                task,
                trust_remote_code,
                tokenizer_mode,
                num_gpus_available,

--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
+from typing import List
+import pytest
+from vllm import LLM
+from ..openai.test_vision import TEST_IMAGE_URLS
+def test_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    prompt1 = "Explain the concept of entropy."
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
+def test_multi_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    prompt1 = "Explain the concept of entropy."
+    prompt2 = "Explain what among us is."
+    conversation1 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    conversation2 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt2
+        },
+    ]
+    messages = [conversation1, conversation2]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 2
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -6,7 +6,6 @@ import pytest
 from vllm import LLM, RequestOutput, SamplingParams
 from ...conftest import cleanup
-from ..openai.test_vision import TEST_IMAGE_URLS
 MODEL_NAME = "facebook/opt-125m"
@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
    # sampling_params is None, default params should be applied
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
-def test_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-    prompt1 = "Explain the concept of entropy."
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-    outputs = llm.chat(messages)
-    assert len(outputs) == 1
-def test_multi_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-    prompt1 = "Explain the concept of entropy."
-    prompt2 = "Explain what among us is."
-    conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-    conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
-    ]
-    messages = [conversation1, conversation2]
-    outputs = llm.chat(messages)
-    assert len(outputs) == 2
-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
-def test_chat_multi_image(image_urls: List[str]):
-    llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
-        dtype="bfloat16",
-        max_model_len=4096,
-        max_num_seqs=5,
-        enforce_eager=True,
-        trust_remote_code=True,
-        limit_mm_per_prompt={"image": 2},
-    )
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
-    outputs = llm.chat(messages)
-    assert len(outputs) >= 0
--- a/tests/entrypoints/llm/test_init.py
+++ b/tests/entrypoints/llm/test_init.py
+import pytest
+from vllm import LLM
+from ...utils import error_on_warning
+MODEL_NAME = "facebook/opt-125m"
+def test_pos_args_deprecated():
+    with error_on_warning(DeprecationWarning):
+        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
+    with error_on_warning(DeprecationWarning):
+        LLM(MODEL_NAME, tokenizer=MODEL_NAME)
+    with pytest.warns(DeprecationWarning, match="'tokenizer'"):
+        LLM(MODEL_NAME, MODEL_NAME)
+    with pytest.warns(DeprecationWarning,
+                      match="'tokenizer', 'tokenizer_mode'"):
+        LLM(MODEL_NAME, MODEL_NAME, "auto")
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -22,12 +22,12 @@ class MockHFConfig:
 @dataclass
 class MockModelConfig:
+    task = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
-    embedding_mode = False
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
 @pytest.fixture(scope="module")
 def server():
    args = [
+        "--task",
+        "generate",
        "--dtype",
        "bfloat16",
        "--max-model-len",

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
 @pytest.fixture(scope="module")
 def phi3v_model_config():
    return ModelConfig(PHI3V_MODEL_ID,
-                       PHI3V_MODEL_ID,
+                       task="generate",
+                       tokenizer=PHI3V_MODEL_ID,
                       tokenizer_mode="auto",
                       trust_remote_code=True,
                       dtype="bfloat16",

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
    worker = Worker(
        model_config=ModelConfig(
            "meta-llama/Llama-2-7b-hf",
-            "meta-llama/Llama-2-7b-hf",
+            task="auto",
+            tokenizer="meta-llama/Llama-2-7b-hf",
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(1, 1, False),
-        scheduler_config=SchedulerConfig(32, 32, 32),
+        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(block_size=16,
                                 gpu_memory_utilization=1.,

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -89,6 +89,7 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
+                     task="generate",
                     max_model_len=4096,
                     max_num_seqs=2,
                     dtype=dtype,

--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
@@ -28,6 +28,7 @@ def test_models(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
+                     task="embedding",
                     max_model_len=4096,
                     max_num_seqs=2,
                     dtype=dtype,

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
 import torch
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.utils import is_cpu
@@ -248,6 +248,7 @@ def check_logprobs_close(
 def build_model_context(model_name: str,
+                        task: TaskOption = "auto",
                        tokenizer_name: Optional[str] = None,
                        trust_remote_code: bool = False,
                        dtype: Optional[Union[str, torch.dtype]] = None,
@@ -273,7 +274,8 @@ def build_model_context(model_name: str,
    model_config = ModelConfig(
        model_name,
-        tokenizer_name,
+        task=task,
+        tokenizer=tokenizer_name,
        tokenizer_mode="auto",
        trust_remote_code=trust_remote_code,
        dtype=dtype,

--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,