[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
86ae693f · Cyrus Leung · GitHub · 8f605ee3 · 86ae693f · 86ae693f
Unverified Commit 86ae693f authored Jul 28, 2025 by Cyrus Leung Committed by GitHub Jul 27, 2025
14 changed files
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
 def _create_proposer(method: str, k: int) -> EagleProposer:
    model_config = ModelConfig(model=model_dir,
-                               task="generate",
+                               runner="generate",
-                               max_model_len=100,
+                               max_model_len=100)
-                               tokenizer=model_dir,
-                               tokenizer_mode="auto",
-                               dtype="auto",
-                               seed=None,
-                               trust_remote_code=False)
    # Choose model directory based on method
    draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir

--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -44,14 +44,7 @@ def test_ngram_proposer():
    def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
        # Dummy model config. Just to set max_model_len.
-        model_config = ModelConfig(model="facebook/opt-125m",
+        model_config = ModelConfig(model="facebook/opt-125m")
-                                   task="generate",
-                                   max_model_len=100,
-                                   tokenizer="facebook/opt-125m",
-                                   tokenizer_mode="auto",
-                                   dtype="auto",
-                                   seed=None,
-                                   trust_remote_code=False)
        return NgramProposer(
            vllm_config=VllmConfig(model_config=model_config,
                                   speculative_config=SpeculativeConfig.

--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -26,10 +26,6 @@ def get_vllm_config():
    )
    model_config = ModelConfig(
        model="facebook/opt-125m",
-        task="generate",
-        tokenizer="facebook/opt-125m",
-        tokenizer_mode="auto",
-        trust_remote_code=True,
        dtype="bfloat16",  # TPUs typically use bfloat16
        seed=42,
    )

--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -76,10 +76,6 @@ def get_vllm_config():
    )
    model_config = ModelConfig(
        model="facebook/opt-125m",
-        task="generate",
-        tokenizer="facebook/opt-125m",
-        tokenizer_mode="auto",
-        trust_remote_code=True,
        dtype="float16",
        seed=42,
    )

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
-from typing_extensions import Self, runtime_checkable
+from typing_extensions import Self, assert_never, runtime_checkable
 import vllm.envs as envs
 from vllm import version
@@ -102,12 +102,63 @@ RunnerOption = Literal["auto", "generate", "pooling", "draft"]
 RunnerType = Literal["generate", "pooling", "draft"]
-_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = {
+ConvertOption = Literal["auto", "none", "embed", "classify", "reward"]
+ConvertType = Literal["none", "embed", "classify", "reward"]
+_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
    "generate": ["generate", "transcription"],
-    "pooling": ["encode", "embed", "classify", "reward"],
+    "pooling": ["embedding", "embed", "classify", "score", "reward"],
+    "draft": ["draft"],
+}
+_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
+    "generate": [],
+    "pooling": ["embed", "classify", "reward"],
    "draft": [],
 }
+# Some model suffixes are based on auto classes from Transformers:
+# https://huggingface.co/docs/transformers/en/model_doc/auto
+# NOTE: Items higher on this list take priority over lower ones
+_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
+    ("ForCausalLM", ("generate", "none")),
+    ("ForConditionalGeneration", ("generate", "none")),
+    ("ChatModel", ("generate", "none")),
+    ("LMHeadModel", ("generate", "none")),
+    ("ForTextEncoding", ("pooling", "embed")),
+    ("EmbeddingModel", ("pooling", "embed")),
+    ("ForSequenceClassification", ("pooling", "classify")),
+    ("ForAudioClassification", ("pooling", "classify")),
+    ("ForImageClassification", ("pooling", "classify")),
+    ("ForVideoClassification", ("pooling", "classify")),
+    ("ClassificationModel", ("pooling", "classify")),
+    ("ForRewardModeling", ("pooling", "reward")),
+    ("RewardModel", ("pooling", "reward")),
+    # Let other `*Model`s take priority
+    ("Model", ("pooling", "embed")),
+]
+def iter_architecture_defaults():
+    yield from _SUFFIX_TO_DEFAULTS
+def try_match_architecture_defaults(
+    architecture: str,
+    *,
+    runner_type: Optional[RunnerType] = None,
+    convert_type: Optional[ConvertType] = None,
+) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
+    for suffix, (default_runner_type,
+                 default_convert_type) in iter_architecture_defaults():
+        if ((runner_type is None or runner_type == default_runner_type) and
+            (convert_type is None or convert_type == default_convert_type)
+                and architecture.endswith(suffix)):
+            return suffix, (default_runner_type, default_convert_type)
+    return None
 @runtime_checkable
 class SupportsHash(Protocol):
@@ -236,11 +287,16 @@ class ModelConfig:
    runner: RunnerOption = "auto"
    """The type of model runner to use. Each vLLM instance only supports one
    model runner, even if the same model can be used for multiple types."""
-    task: TaskOption = "auto"
+    convert: ConvertOption = "auto"
-    """The task to use the model for. If the model supports more than one
+    """Convert the model using adapters defined in
-    model runner, this is used to select which model runner to run.
+    [vllm.model_executor.models.adapters][]. The most common use case is to
+    adapt a text generation model to be used for pooling tasks."""
-    Note that the model may support other tasks using the same model runner."""
+    task: Optional[TaskOption] = None
+    """[DEPRECATED] The task to use the model for. If the model supports more
+    than one model runner, this is used to select which model runner to run.
+    Note that the model may support other tasks using the same model runner.
+    """
    tokenizer: SkipValidation[str] = None  # type: ignore
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
@@ -558,48 +614,103 @@ class ModelConfig:
        self.hf_image_processor_config = get_hf_image_processor_config(
            self.model, hf_token=self.hf_token, revision=self.revision)
-        # For pooling models, self.task is used to indicate the
+        architectures = self.architectures
-        # user-selected task
+        registry = self.registry
-        if self.task == "score":
+        is_generative_model = registry.is_text_generation_model(
-            if self._is_classify_task(self.architectures):
+            architectures, self)
-                self.task = "classify"
+        is_pooling_model = registry.is_pooling_model(architectures, self)
+        def _task_to_convert(task: TaskOption) -> ConvertType:
+            if task == "embedding" or task == "embed":
+                return "embed"
+            if task == "classify":
+                return "classify"
+            if task == "reward":
+                return "reward"
+            if task == "score":
+                new_task = self._get_default_pooling_task(architectures)
+                return "classify" if new_task == "classify" else "embed"
+            return "none"
+        if self.task is not None:
+            runner: RunnerOption = "auto"
+            convert: ConvertOption = "auto"
+            msg_prefix = ("The 'task' option has been deprecated and will be "
+                          "removed in v0.13.0 or v1.0, whichever comes first.")
+            msg_hint = "Please remove this option."
+            is_generative_task = self.task in _RUNNER_TASKS["generate"]
+            is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
+            if is_generative_model and is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = ("Please replace this option with `--runner "
+                                "generate` to continue using this model "
+                                "as a generative model.")
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = "auto"
+                    msg_hint = ("Please replace this option with `--runner "
+                                "pooling` to continue using this model "
+                                "as a pooling model.")
+                else:  # task == "auto"
+                    pass
+            elif is_generative_model or is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = "Please remove this option"
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = _task_to_convert(self.task)
+                    msg_hint = ("Please replace this option with `--convert "
+                                f"{convert}` to continue using this model "
+                                "as a pooling model.")
+                else:  # task == "auto"
+                    pass
            else:
-                self.task = "embed"
+                raise AssertionError("The model should be a generative or "
-        elif self.task == "embedding":
+                                     "pooling model when task is set to "
-            msg = ("The 'embedding' task has been renamed to 'embed', please "
+                                     f"{self.task!r}.")
-                   "use the new name. The old name will be removed in v1.0.")
+            self.runner = runner
+            self.convert = convert
+            msg = f"{msg_prefix} {msg_hint}"
            warnings.warn(msg, DeprecationWarning, stacklevel=2)
-            self.task = "embed"
+        self.runner_type = self._get_runner_type(architectures, self.runner)
+        self.convert_type = self._get_convert_type(architectures,
+                                                   self.runner_type,
+                                                   self.convert)
+        if self.runner_type == "generate" and not is_generative_model:
+            generate_converts = _RUNNER_CONVERTS["generate"]
+            if self.convert_type not in generate_converts:
+                # Currently we don't have any converters for generative models
+                raise ValueError(
+                    "This model does not support `--runner generate`.")
+        if self.runner_type == "pooling" and not is_pooling_model:
+            pooling_converts = _RUNNER_CONVERTS["pooling"]
+            if self.convert_type not in pooling_converts:
+                convert_option = "<" + "|".join(pooling_converts) + ">"
+                raise ValueError(
+                    "This model does not support `--runner pooling`. "
+                    f"You can pass `--convert {convert_option} to adapt "
+                    "it into a pooling model.")
-        model_info, arch = self.registry.inspect_model_cls(self.architectures)
+        self.supported_tasks = self._get_supported_tasks(
+            architectures, self.runner_type, self.convert_type)
+        # Note: Initialize these attributes early because transformers fallback
+        # may fail to load dynamic modules in child processes
+        model_info, arch = registry.inspect_model_cls(architectures, self)
        self._model_info = model_info
        self._architecture = arch
+        logger.info("Resolved architecture: %s", arch)
-        all_supported_tasks = self._get_supported_tasks(self.task)
-        logger.debug("Tasks supported by runner type: %s", all_supported_tasks)
-        supported_runner_types = self._get_supported_runner_types(
-            all_supported_tasks)
-        runner_type = self._resolve_runner(self.runner, self.task,
-                                           supported_runner_types,
-                                           all_supported_tasks)
-        logger.debug("Selected runner type: %s", runner_type)
-        # For pooling models, self.task is used to indicate the
-        # user-selected task
-        if runner_type == "pooling" and self.task == "auto":
-            selected_task = all_supported_tasks[runner_type][-1]
-            assert selected_task != "encode"
-            self.task = selected_task
-        self.supported_runner_types = supported_runner_types
-        self.runner_type = runner_type
-        self.supported_tasks = all_supported_tasks[runner_type]
-        if self.runner_type in ("draft",
-                                "generate") and self.task != "transcription":
-            self.truncation_side = "left"
-        else:
-            self.truncation_side = "right"
        self.pooler_config = self._init_pooler_config()
@@ -652,16 +763,10 @@ class ModelConfig:
        self.original_max_model_len = self.max_model_len
        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
        self.multimodal_config = self._init_multimodal_config()
-        self.model_supports_multimodal_raw_input = (
-            self.registry.supports_multimodal_raw_input(self.architectures))
        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
-        self.is_attention_free = self._init_attention_free()
-        self.is_hybrid = self._init_is_hybrid()
-        self.has_noops = self._init_has_noops()
-        self.has_inner_state = self._init_has_inner_state()
        if (not current_platform.is_neuron() and self.override_neuron_config):
            raise ValueError(
                "`override_neuron_config` is only supported on Neuron.")
@@ -702,30 +807,13 @@ class ModelConfig:
    @property
    def architectures(self) -> list[str]:
-        # architectures in the model config.
+        return getattr(self.hf_config, "architectures", [])
-        architectures = getattr(self.hf_config, "architectures", [])
-        # The registry assumes that it can always inspect the vLLM model class
-        # for a given architecture. This assumption breaks down for the
-        # Transformers backend, which may use a different class depending on
-        # the model type. To work around this, we add the correct Transformers
-        # backend class to the architectures list. We must do this here because
-        # we need access to the `hf_config` to determine the backend class.
-        transformers_backend_cls = self._get_transformers_backend_cls()
-        if (self.model_impl != ModelImpl.VLLM.value
-                and all(arch != transformers_backend_cls
-                        for arch in architectures)):
-            architectures.append(transformers_backend_cls)
-        return architectures
    @property
    def architecture(self) -> str:
-        # The architecture vllm actually used.
+        """The architecture vllm actually used."""
        return self._architecture
-    @property
-    def model_info(self):
-        return self._model_info
    def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                          tokenizer: str) -> None:
        """Pull model/tokenizer from S3 to temporary directory when needed.
@@ -763,7 +851,7 @@ class ModelConfig:
            self.tokenizer = s3_tokenizer.dir
    def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
-        if self.registry.is_multimodal_model(self.architectures):
+        if self.registry.is_multimodal_model(self.architectures, self):
            return MultiModalConfig(
                limit_per_prompt=self.limit_mm_per_prompt,
                media_io_kwargs=self.media_io_kwargs,
@@ -819,19 +907,6 @@ class ModelConfig:
        return None
-    def _init_attention_free(self) -> bool:
-        return self.registry.is_attention_free_model(self.architectures)
-    def _init_is_hybrid(self) -> bool:
-        return self.registry.is_hybrid_model(self.architectures)
-    def _init_has_noops(self) -> bool:
-        architectures = getattr(self.hf_config, "architectures", [])
-        return self.registry.is_noops_model(architectures)
-    def _init_has_inner_state(self) -> bool:
-        return self.registry.model_has_inner_state(self.architectures)
    def _verify_tokenizer_mode(self) -> None:
        tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
        if tokenizer_mode not in get_args(TokenizerMode):
@@ -840,155 +915,168 @@ class ModelConfig:
                f"one of {get_args(TokenizerMode)}.")
        self.tokenizer_mode = tokenizer_mode
-    def _is_classify_task(self, architectures: list[str]):
+    def _get_default_runner_type(
+        self,
+        architectures: list[str],
+    ) -> RunnerType:
+        registry = self.registry
+        # Some Sentence Transformers models use *ForCausalLM archs
+        if get_pooling_config(self.model, self.revision):
+            return "pooling"
        for arch in architectures:
-            if arch.endswith("ForSequenceClassification"):
+            if arch in registry.get_supported_archs():
-                return True
+                if registry.is_pooling_model(architectures, self):
-        return self.registry.is_cross_encoder_model(architectures)
+                    return "pooling"
+                if registry.is_text_generation_model(architectures, self):
+                    return "generate"
+            match = try_match_architecture_defaults(arch)
+            if match:
+                _, (runner_type, _) = match
+                return runner_type
+        return "generate"
-    def _get_preferred_pooling_task(
+    def _get_runner_type(
        self,
        architectures: list[str],
-    ) -> _ResolvedTask:
+        runner: RunnerOption,
-        model_id = self.model
+    ) -> RunnerType:
-        if get_pooling_config(model_id, self.revision):
+        if runner != "auto":
+            return runner
+        runner_type = self._get_default_runner_type(architectures)
+        logger.info(
+            "Resolved `--runner auto` to `--runner %s`. "
+            "Pass the value explicitly to silence this message.", runner_type)
+        return runner_type
+    def _get_default_convert_type(
+        self,
+        architectures: list[str],
+        runner_type: RunnerType,
+    ) -> ConvertType:
+        registry = self.registry
+        for arch in architectures:
+            if arch in registry.get_supported_archs():
+                if (runner_type == "generate"
+                        and registry.is_text_generation_model(
+                            architectures, self)):
+                    return "none"
+                if (runner_type == "pooling"
+                        and registry.is_pooling_model(architectures, self)):
+                    return "none"
+            match = try_match_architecture_defaults(arch,
+                                                    runner_type=runner_type)
+            if match:
+                _, (_, convert_type) = match
+                return convert_type
+        # This is to handle Sentence Transformers models that use *ForCausalLM
+        # and also multi-modal pooling models which are not defined as
+        # Sentence Transformers models
+        if runner_type == "pooling":
            return "embed"
-        if self.registry.is_transcription_model(architectures):
-            return "transcription"
-        suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [
+        return "none"
-            # Other models follow this pattern
-            ("EmbeddingModel", "embed"),
+    def _get_convert_type(
-            ("RewardModel", "reward"),
+        self,
-        ]
+        architectures: list[str],
+        runner_type: RunnerType,
+        convert: ConvertOption,
+    ) -> ConvertType:
+        if convert != "auto":
+            return convert
-        for suffix, pref_task in suffix_to_preferred_task:
+        convert_type = self._get_default_convert_type(architectures,
-            if self.architecture.endswith(suffix):
+                                                      runner_type)
-                return pref_task
-        return "embed"
+        logger.info(
+            "Resolved `--convert auto` to `--convert %s`. "
+            "Pass the value explicitly to silence this message.", convert_type)
+        return convert_type
    def _get_supported_generation_tasks(
        self,
-        task_option: TaskOption,
+        architectures: list[str],
+        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry
-        architectures = self.architectures
-        if registry.is_transcription_only_model(architectures):
+        if registry.is_transcription_only_model(architectures, self):
            return ["transcription"]
+        # TODO: Use get_supported_generation_tasks once V0 is removed
        supported_tasks = list[_ResolvedTask]()
-        if registry.is_text_generation_model(architectures):
+        if (registry.is_text_generation_model(architectures, self)
+                or convert_type in _RUNNER_CONVERTS["generate"]):
            supported_tasks.append("generate")
-            if registry.is_transcription_model(architectures):
+        if registry.is_transcription_model(architectures, self):
-                supported_tasks.append("transcription")
+            supported_tasks.append("transcription")
        return supported_tasks
+    def _get_default_pooling_task(
+        self,
+        architectures: list[str],
+    ) -> Literal["embed", "classify", "reward"]:
+        if self.registry.is_cross_encoder_model(architectures, self):
+            return "classify"
+        for arch in architectures:
+            match = try_match_architecture_defaults(arch,
+                                                    runner_type="pooling")
+            if match:
+                _, (_, convert_type) = match
+                assert convert_type != "none"
+                return convert_type
+        return "embed"
    def _get_supported_pooling_tasks(
        self,
-        task_option: TaskOption,
+        architectures: list[str],
+        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry
-        architectures = self.architectures
+        # TODO: Use get_supported_pooling_tasks once V0 is removed
        supported_tasks = list[_ResolvedTask]()
-        if registry.is_pooling_model(architectures):
+        if (registry.is_pooling_model(architectures, self)
+                or convert_type in _RUNNER_CONVERTS["pooling"]):
            supported_tasks.append("encode")
-            # For now, users must specify the task (other than "pooling")
+            extra_task = (self._get_default_pooling_task(architectures)
-            # to use for pooling models
+                          if convert_type == "none" else convert_type)
-            if task_option == "auto":
+            supported_tasks.append(extra_task)
-                preferred_task = self._get_preferred_pooling_task(
-                    architectures)
-                supported_tasks.append(preferred_task)
-            elif task_option in _RUNNER_TASKS["pooling"]:
-                supported_tasks.append(cast(_ResolvedTask, task_option))
        return supported_tasks
    def _get_supported_tasks(
        self,
-        task_option: TaskOption,
+        architectures: list[str],
-    ) -> dict[RunnerType, list[_ResolvedTask]]:
+        runner_type: RunnerType,
-        if self._is_classify_task(self.architectures):
+        convert_type: ConvertType,
-            return {"generate": [], "pooling": ["classify"], "draft": []}
+    ) -> list[_ResolvedTask]:
-        else:
+        if runner_type == "generate":
-            return {
+            return self._get_supported_generation_tasks(
-                "generate": self._get_supported_generation_tasks(task_option),
+                architectures, convert_type)
-                "pooling": self._get_supported_pooling_tasks(task_option),
+        if runner_type == "pooling":
-                "draft": ["draft"]
+            return self._get_supported_pooling_tasks(architectures,
-            }
+                                                     convert_type)
+        if runner_type == "draft":
-    def _get_supported_runner_types(
+            return ["draft"]
-        self,
-        supported_tasks: dict[RunnerType, list[_ResolvedTask]],
-    ) -> set[RunnerType]:
-        return {
-            runner
-            for runner, runner_tasks in supported_tasks.items()
-            if len(runner_tasks) > 0
-        }
-    def _resolve_runner(
-        self,
-        runner_option: RunnerOption,
-        task_option: TaskOption,
-        supported_runner_types: set[RunnerType],
-        supported_tasks: dict[RunnerType, list[_ResolvedTask]],
-    ) -> RunnerType:
-        if not supported_runner_types:
-            raise ValueError("This model does not support any model runners!")
-        if runner_option != "auto":
-            if runner_option not in supported_runner_types:
-                raise ValueError(
-                    f"This model does not support runner={runner_option!r}. "
-                    f"Available runners: {supported_runner_types}")
-            return runner_option
-        if task_option != "auto":
-            for runner, runner_tasks in supported_tasks.items():
-                if task_option in runner_tasks:
-                    return runner
-            else:
-                task_runner: RunnerType = next(
-                    runner for runner, tasks in _RUNNER_TASKS.items()
-                    if task_option in tasks)
-                raise ValueError(
-                    f"This model does not support task={task_option!r}. "
-                    f"Available tasks for runner={task_runner!r}: "
-                    f"{supported_tasks[task_runner]}")
-        if "classify" in supported_tasks.get("pooling", []):
-            # When multiple pooling tasks are present, default to
-            # pooling (eg cross-encoder) for non-standard architectures.
-            return "pooling"
-        suffix_to_preferred_runner: list[tuple[str, RunnerType]] = [
-            ("ForCausalLM", "generate"),
-            ("ForConditionalGeneration", "generate"),
-            ("ChatModel", "generate"),
-            ("LMHeadModel", "generate"),
-            ("EmbeddingModel", "pooling"),
-            ("RewardModel", "pooling"),
-        ]
-        for suffix, pref_runner in suffix_to_preferred_runner:
-            if self.architecture.endswith(
-                    suffix) and pref_runner in supported_runner_types:
-                return pref_runner
-        if "generate" in supported_runner_types:
-            return "generate"
-        if "pooling" in supported_runner_types:
-            return "pooling"
-        raise AssertionError("This line should not be reached")
+        assert_never(runner_type)
    def _parse_quant_hf_config(self):
        quant_cfg = getattr(self.hf_config, "quantization_config", None)
@@ -1216,7 +1304,8 @@ class ModelConfig:
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if pipeline_parallel_size > 1:
-            if not self.registry.is_pp_supported_model(self.architectures):
+            if not self.registry.is_pp_supported_model(self.architectures,
+                                                       self):
                raise NotImplementedError(
                    "Pipeline parallelism is not supported for this model. "
                    "Supported models implement the `SupportsPP` interface.")
@@ -1558,16 +1647,40 @@ class ModelConfig:
    @property
    def is_cross_encoder(self) -> bool:
-        return self.task == "classify"
+        return (self._model_info.supports_cross_encoding
+                or self.convert_type == "classify")
    @property
-    def use_mla(self) -> bool:
+    def is_pp_supported(self) -> bool:
-        return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE
+        return self._model_info.supports_pp
+    @property
+    def is_multimodal_raw_input_supported(self) -> bool:
+        return self._model_info.supports_multimodal_raw_input
+    @property
+    def is_attention_free(self) -> bool:
+        return self._model_info.is_attention_free
+    @property
+    def is_hybrid(self) -> bool:
+        return self._model_info.is_hybrid
+    @property
+    def has_noops(self) -> bool:
+        return self._model_info.has_noops
+    @property
+    def has_inner_state(self):
+        return self._model_info.has_inner_state
    @property
    def is_v1_compatible(self) -> bool:
-        architectures = getattr(self.hf_config, "architectures", [])
+        return not self._model_info.supports_v0_only
-        return me_models.ModelRegistry.is_v1_compatible(architectures)
+    @property
+    def use_mla(self) -> bool:
+        return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE
    @property
    def is_matryoshka(self) -> bool:
@@ -4769,7 +4882,10 @@ class VllmConfig:
        self.scheduler_config.max_model_len = max_model_len
    def try_verify_and_update_config(self):
-        architecture = getattr(self.model_config, "architecture", None)
+        if self.model_config is None:
+            return
+        architecture = self.model_config.architecture
        if architecture is None:
            return
@@ -4782,7 +4898,7 @@ class VllmConfig:
        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)
-        if self.model_config.task == "classify":
+        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -22,14 +22,15 @@ from typing_extensions import TypeIs
 import vllm.envs as envs
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
-                         ConfigFormat, ConfigType, DecodingConfig,
+                         ConfigFormat, ConfigType, ConvertOption,
-                         DetailedTraceModules, Device, DeviceConfig,
+                         DecodingConfig, DetailedTraceModules, Device,
-                         DistributedExecutorBackend, GuidedDecodingBackend,
+                         DeviceConfig, DistributedExecutorBackend,
-                         GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
+                         GuidedDecodingBackend, GuidedDecodingBackendV1,
-                         KVTransferConfig, LoadConfig, LogprobsMode,
+                         HfOverrides, KVEventsConfig, KVTransferConfig,
-                         LoRAConfig, ModelConfig, ModelDType, ModelImpl,
+                         LoadConfig, LogprobsMode, LoRAConfig, ModelConfig,
-                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
+                         ModelDType, ModelImpl, MultiModalConfig,
-                         PoolerConfig, PrefixCachingHashAlgo, SchedulerConfig,
+                         ObservabilityConfig, ParallelConfig, PoolerConfig,
+                         PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
                         SchedulerPolicy, SpeculativeConfig, TaskOption,
                         TokenizerMode, VllmConfig, get_attr_docs, get_field)
 from vllm.logger import init_logger
@@ -270,7 +271,9 @@ class EngineArgs:
        str, List[str]]] = ModelConfig.served_model_name
    tokenizer: Optional[str] = ModelConfig.tokenizer
    hf_config_path: Optional[str] = ModelConfig.hf_config_path
-    task: TaskOption = ModelConfig.task
+    runner: RunnerOption = ModelConfig.runner
+    convert: ConvertOption = ModelConfig.convert
+    task: Optional[TaskOption] = ModelConfig.task
    skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
    enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
@@ -461,7 +464,11 @@ class EngineArgs:
        )
        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
            model_group.add_argument("--model", **model_kwargs["model"])
-        model_group.add_argument("--task", **model_kwargs["task"])
+        model_group.add_argument("--runner", **model_kwargs["runner"])
+        model_group.add_argument("--convert", **model_kwargs["convert"])
+        model_group.add_argument("--task",
+                                 **model_kwargs["task"],
+                                 deprecated=True)
        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
        model_group.add_argument("--tokenizer-mode",
                                 **model_kwargs["tokenizer_mode"])
@@ -870,6 +877,8 @@ class EngineArgs:
        return ModelConfig(
            model=self.model,
            hf_config_path=self.hf_config_path,
+            runner=self.runner,
+            convert=self.convert,
            task=self.task,
            tokenizer=self.tokenizer,
            tokenizer_mode=self.tokenizer_mode,

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                              create_sort_beams_key_function)
 from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
                         is_init_field)
-from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
-                                   TaskOption)
+                                   PoolerConfig, RunnerOption)
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                         ChatTemplateContentFormatOption,
@@ -170,7 +170,8 @@ class LLM:
        self,
        model: str,
        *,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
+        convert: ConvertOption = "auto",
        tokenizer: Optional[str] = None,
        tokenizer_mode: TokenizerMode = "auto",
        skip_tokenizer_init: bool = False,
@@ -244,7 +245,8 @@ class LLM:
        engine_args = EngineArgs(
            model=model,
-            task=task,
+            runner=runner,
+            convert=convert,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            skip_tokenizer_init=skip_tokenizer_init,
@@ -459,18 +461,10 @@ class LLM:
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "generate":
-            messages = [
+            raise ValueError(
-                "LLM.generate() is only supported for generative models."
+                "LLM.generate() is only supported for generative models. "
-            ]
+                "Try passing `--runner generate` to use the model as a "
+                "generative model.")
-            if "generate" in model_config.supported_runner_types:
-                messages.append(
-                    "Your model supports the 'generate' runner, but is "
-                    f"currently initialized for the '{runner_type}' runner. "
-                    "Please initialize vLLM using `--task generate` or "
-                    "`--task transcription`.")
-            raise ValueError(" ".join(messages))
        if prompt_token_ids is not None:
            parsed_prompts = self._convert_v1_inputs(
@@ -497,7 +491,8 @@ class LLM:
        truncate_prompt_tokens = None
        if isinstance(sampling_params, SamplingParams):
            truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
-        _validate_truncation_size(self.llm_engine.model_config.max_model_len,
+        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens, tokenization_kwargs)
        # Add any modality specific loras to the corresponding prompts
@@ -1100,16 +1095,10 @@ class LLM:
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
-            messages = ["LLM.encode() is only supported for pooling models."]
+            raise ValueError(
+                "LLM.encode() is only supported for pooling models. "
-            if "pooling" in model_config.supported_runner_types:
+                "Try passing `--runner pooling` to use the model as a "
-                messages.append(
+                "pooling model.")
-                    "Your model supports the 'pooling' runner, but is "
-                    f"currently initialized for the '{runner_type}' runner. "
-                    "Please initialize vLLM using `--task embed`, "
-                    "`--task classify`, `--task score` etc.")
-            raise ValueError(" ".join(messages))
        if prompt_token_ids is not None:
            parsed_prompts = self._convert_v1_inputs(
@@ -1183,8 +1172,9 @@ class LLM:
            embedding vectors in the same order as the input prompts.
        """
        if "embed" not in self.supported_tasks:
-            raise ValueError("Embedding API is not supported by this model. "
+            raise ValueError(
-                             "Please set `--task embed`.")
+                "Embedding API is not supported by this model. "
+                "Try converting the model using `--convert embed`.")
        items = self.encode(
            prompts,
@@ -1229,7 +1219,7 @@ class LLM:
        if "classify" not in self.supported_tasks:
            raise ValueError(
                "Classification API is not supported by this model. "
-                "Please set `--task classify`.")
+                "Try converting the model using `--convert classify`.")
        items = self.encode(
            prompts,
@@ -1283,27 +1273,26 @@ class LLM:
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:
+        model_config = self.llm_engine.model_config
        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError(
-                "Score API is only enabled for `--task embed or score`")
+                "Score API is not supported for Mistral tokenizer")
        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)
        pooling_params = PoolingParams(task="score")
        tokenization_kwargs: dict[str, Any] = {}
-        _validate_truncation_size(self.llm_engine.model_config.max_model_len,
+        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens, tokenization_kwargs)
        parsed_prompts = []
        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
-        if self.llm_engine.model_config.is_multimodal_model:
+        if model_config.is_multimodal_model:
-            model_config = self.llm_engine.model_config
            for q, d in input_pairs:
                _, engine_prompt = get_score_prompt(
                    model_config=model_config,
@@ -1314,11 +1303,9 @@ class LLM:
                )
                parsed_prompts.append(engine_prompt)
        else:
            for q, t in input_pairs:
-                if self.llm_engine.model_config.use_pad_token:
+                if model_config.use_pad_token:
                    # cross_encoder models default to using pad_token.
                    prompt_inputs = tokenizer(
                        text=q,  # type: ignore[arg-type]
@@ -1396,23 +1383,18 @@ class LLM:
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
-            messages = ["LLM.score() is only supported for pooling models."]
+            raise ValueError(
+                "LLM.score() is only supported for pooling models. "
-            if "pooling" in model_config.supported_runner_types:
+                "Try passing `--runner pooling` to use the model as a "
-                messages.append(
+                "pooling model.")
-                    "Your model supports the 'pooling' runner, but is "
-                    f"currently initialized for the '{runner_type}' runner. "
-                    "Please initialize vLLM using `--task embed`, "
-                    "`--task classify`, `--task score` etc.")
-            raise ValueError(" ".join(messages))
        supported_tasks = self.supported_tasks
        if all(t not in supported_tasks for t in ("embed", "classify")):
            raise ValueError("Score API is not supported by this model. "
-                             "Please set `--task embed` or `--task classify`.")
+                             "Try converting the model using "
+                             "`--convert embed` or `--convert classify`.")
-        if (model_config.task == "classify"
+        if (model_config.is_cross_encoder
                and getattr(model_config.hf_config, "num_labels", 0) != 1):
            raise ValueError("Score API is only enabled for num_labels == 1.")
@@ -1421,15 +1403,14 @@ class LLM:
        # lists of tokens to the `text` and `text_pair` kwargs
        tokenizer = self.get_tokenizer()
-        if not self.llm_engine.model_config.is_multimodal_model:
+        if not model_config.is_multimodal_model:
            def check_data_type(data: Union[SingletonPrompt,
                                            Sequence[SingletonPrompt],
                                            ScoreMultiModalParam]):
                if isinstance(data, dict) and "content" in data:
-                    raise ValueError(
+                    raise ValueError("ScoreMultiModalParam is not supported "
-                        f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}",  # noqa: E501
+                                     f"for {model_config.architecture}")
-                    )
            check_data_type(data_1)
            check_data_type(data_2)
@@ -1471,7 +1452,7 @@ class LLM:
        _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]
-        if self.llm_engine.model_config.is_cross_encoder:
+        if model_config.is_cross_encoder:
            return self._cross_encoding_score(
                tokenizer,
                data_1,  # type: ignore[arg-type]

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1734,7 +1734,6 @@ async def init_app_state(
        state.openai_serving_models,
        request_logger=request_logger,
    ) if "transcription" in supported_tasks else None
-    state.task = model_config.task
    state.enable_server_load_tracking = args.enable_server_load_tracking
    state.server_load_metrics = 0

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -9,9 +9,8 @@ from dataclasses import dataclass, field
 from typing import Optional
 import torch
-import transformers
 from torch import nn
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
+from typing_extensions import assert_never
 from vllm.attention import Attention
 from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
@@ -20,13 +19,10 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import QKVCrossParallelLinear
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.models.adapters import (as_embedding_model,
                                                 as_reward_model,
                                                 as_seq_cls_model)
 from vllm.model_executor.models.interfaces import SupportsQuant
-from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
-                                                 _TRANSFORMERS_BACKEND_MODELS)
 from vllm.utils import is_pin_memory_available
 logger = init_logger(__name__)
@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module,
        # New parameters or parameters already on target device are untouched
-def resolve_transformers_arch(model_config: ModelConfig,
-                              architectures: list[str]):
-    if model_config.model_impl == ModelImpl.VLLM:
-        raise ValueError(
-            "Attempting to resolve architecture from the Transformers library "
-            "but the model implementation is set to vLLM. This should never "
-            "happen.")
-    for i, arch in enumerate(architectures):
-        if arch in _TRANSFORMERS_BACKEND_MODELS:
-            continue
-        if model_config.model_impl == ModelImpl.AUTO:
-            logger.warning(
-                "%s has no vLLM implementation, falling back to Transformers "
-                "implementation. Some features may not be supported and "
-                "performance may not be optimal.", arch)
-        auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
-                                           None) or dict()
-        # Make sure that config class is always initialized before model class,
-        # otherwise the model class won't be able to access the config class,
-        # the expected auto_map should have correct order like:
-        # "auto_map": {
-        #     "AutoConfig": "<your-repo-name>--<config-name>",
-        #     "AutoModel": "<your-repo-name>--<config-name>",
-        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
-        # },
-        auto_modules = {
-            name:
-            get_class_from_dynamic_module(module,
-                                          model_config.model,
-                                          revision=model_config.revision)
-            for name, module in sorted(auto_map.items(), key=lambda x: x[0])
-        }
-        model_module = getattr(transformers, arch, None)
-        if model_module is None:
-            if "AutoModel" not in auto_map:
-                raise ValueError(
-                    f"Cannot find model module. '{arch}' is not a registered "
-                    "model in the Transformers library (only relevant if the "
-                    "model is meant to be in Transformers) and 'AutoModel' is "
-                    "not present in the model config's 'auto_map' (relevant "
-                    "if the model is custom).")
-            model_module = auto_modules["AutoModel"]
-        if not model_module.is_backend_compatible():
-            raise ValueError(
-                f"The Transformers implementation of '{arch}' is not "
-                "compatible with vLLM.")
-        architectures[i] = model_config._get_transformers_backend_cls()
-    return architectures
 def get_model_architecture(
        model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    architectures = getattr(model_config.hf_config, "architectures", [])
@@ -239,56 +180,38 @@ def get_model_architecture(
        "bitsandbytes",
    ]
-    vllm_supported_archs = ModelRegistry.get_supported_archs()
+    if (model_config.quantization is not None
-    is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
+            and model_config.quantization not in mixtral_supported
-                                 _TRANSFORMERS_BACKEND_MODELS)
+            and "MixtralForCausalLM" in architectures):
-    vllm_not_supported = not any(is_supported(arch) for arch in architectures)
+        architectures = ["QuantMixtralForCausalLM"]
-    if vllm_not_supported:
-        # try automatic conversion in adapters.py
-        for arch in architectures:
-            if not arch.endswith("ForSequenceClassification"):
-                continue
-            assert model_config.task == "classify"
+    model_cls, arch = model_config.registry.resolve_model_cls(
-            causal_lm_arch = arch.replace("ForSequenceClassification",
+        architectures,
-                                          "ForCausalLM")
+        model_config=model_config,
-            causal_lm_arch_vllm_supported = (causal_lm_arch
+    )
-                                             in vllm_supported_archs)
-            if not causal_lm_arch_vllm_supported:
-                continue
-            architectures = [causal_lm_arch]
+    if arch == model_config._get_transformers_backend_cls():
-            vllm_not_supported = False
+        assert model_config.model_impl != ModelImpl.VLLM
-            break
+        if model_config.model_impl == ModelImpl.AUTO:
+            logger.warning_once(
-    if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures):
+                "%s has no vLLM implementation, falling back to Transformers "
-        previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]]
+                "implementation. Some features may not be supported and "
-        raise ValueError(
+                "performance may not be optimal.", arch)
-            f"Model architecture {architectures[0]} was supported"
-            f" in vLLM until version {previous_version}, and is "
-            "not supported anymore. Please use an older version"
-            " of vLLM if you want to use this model architecture.")
-    if (model_config.model_impl == ModelImpl.TRANSFORMERS or
-            model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
-        architectures = resolve_transformers_arch(model_config, architectures)
-        logger.debug_once("Resolve transformers arch %s", str(architectures))
-    elif (model_config.quantization is not None
-          and model_config.quantization not in mixtral_supported
-          and "MixtralForCausalLM" in architectures):
-        architectures = ["QuantMixtralForCausalLM"]
-    model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
+    convert_type = model_config.convert_type
-    if model_config.task == "embed":
+    if convert_type == "none":
-        logger.debug_once("Automatic conversion using `as_embedding_model`.")
+        pass
+    elif convert_type == "embed":
+        logger.debug_once("Converting to embedding model.")
        model_cls = as_embedding_model(model_cls)
-    elif model_config.task == "classify":
+    elif convert_type == "classify":
-        logger.debug_once("Automatic conversion using `as_seq_cls_model`.")
+        logger.debug_once("Converting to sequence classification model.")
        model_cls = as_seq_cls_model(model_cls)
-    elif model_config.task == "reward":
+    elif convert_type == "reward":
-        logger.debug_once("Automatic conversion using `as_reward_model`.")
+        logger.debug_once("Converting to reward model.")
        model_cls = as_reward_model(model_cls)
+    else:
+        assert_never(convert_type)
    return model_cls, arch

--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
            dtype=kv_cache_dtype,
            use_mla=model_config.use_mla).page_size_bytes
-        model_cls = ModelRegistry.resolve_model_cls(
+        model_cls, _ = ModelRegistry.resolve_model_cls(
-            model_config._model_info.architecture)[0]
+            model_config.architecture,
+            model_config=model_config,
+        )
        # get mamba page size
        mamba_page_size = MambaSpec(

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -12,19 +12,24 @@ import sys
 import tempfile
 from abc import ABC, abstractmethod
 from collections.abc import Set
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Callable, Optional, TypeVar, Union
 import torch.nn as nn
+import transformers
+from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
+                         try_match_architecture_defaults)
 from vllm.logger import init_logger
+from vllm.transformers_utils.dynamic_module import (
+    try_get_class_from_dynamic_module)
 from .interfaces import (has_inner_state, has_noops, is_attention_free,
                         is_hybrid, supports_cross_encoding,
                         supports_multimodal, supports_multimodal_raw_input,
                         supports_pp, supports_transcription, supports_v0_only)
-from .interfaces_base import is_text_generation_model
+from .interfaces_base import is_pooling_model, is_text_generation_model
 logger = init_logger(__name__)
@@ -311,7 +316,7 @@ class _ModelInfo:
        return _ModelInfo(
            architecture=model.__name__,
            is_text_generation_model=is_text_generation_model(model),
-            is_pooling_model=True,  # Can convert any model into a pooling model
+            is_pooling_model=is_pooling_model(model),
            supports_cross_encoding=supports_cross_encoding(model),
            supports_multimodal=supports_multimodal(model),
            supports_multimodal_raw_input=supports_multimodal_raw_input(model),
@@ -465,6 +470,16 @@ class _ModelRegistry:
                f"Model architectures {architectures} failed "
                "to be inspected. Please check the logs for more details.")
+        for arch in architectures:
+            if arch in _PREVIOUSLY_SUPPORTED_MODELS:
+                previous_version = _PREVIOUSLY_SUPPORTED_MODELS[arch]
+                raise ValueError(
+                    f"Model architecture {arch} was supported in vLLM until "
+                    f"v{previous_version}, and is not supported anymore. "
+                    "Please use an older version of vLLM if you want to "
+                    "use this model architecture.")
        raise ValueError(
            f"Model architectures {architectures} are not supported for now. "
            f"Supported architectures: {all_supported_archs}")
@@ -477,174 +492,284 @@ class _ModelRegistry:
        return _try_load_model_cls(model_arch, self.models[model_arch])
    def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
-        if model_arch in self.models:
+        if model_arch not in self.models:
-            return _try_inspect_model_cls(model_arch, self.models[model_arch])
+            return None
-        if model_arch.endswith("ForSequenceClassification"):
+        return _try_inspect_model_cls(model_arch, self.models[model_arch])
-            causal_lm_arch = model_arch.replace("ForSequenceClassification",
-                                                "ForCausalLM")
+    def _try_resolve_transformers(
-            if causal_lm_arch not in self.models:
+        self,
+        architecture: str,
+        model_config: ModelConfig,
+    ) -> Optional[str]:
+        if architecture in _TRANSFORMERS_BACKEND_MODELS:
+            return architecture
+        auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
+                                           None) or dict()
+        # Make sure that config class is always initialized before model class,
+        # otherwise the model class won't be able to access the config class,
+        # the expected auto_map should have correct order like:
+        # "auto_map": {
+        #     "AutoConfig": "<your-repo-name>--<config-name>",
+        #     "AutoModel": "<your-repo-name>--<config-name>",
+        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
+        # },
+        for prefix in ("AutoConfig", "AutoModel"):
+            for name, module in auto_map.items():
+                if name.startswith(prefix):
+                    try_get_class_from_dynamic_module(
+                        module,
+                        model_config.model,
+                        revision=model_config.revision,
+                        warn_on_fail=False,
+                    )
+        model_module = getattr(transformers, architecture, None)
+        if model_module is None:
+            for name, module in auto_map.items():
+                if name.startswith("AutoModel"):
+                    model_module = try_get_class_from_dynamic_module(
+                        module,
+                        model_config.model,
+                        revision=model_config.revision,
+                        warn_on_fail=True,
+                    )
+                    if model_module is not None:
+                        break
+            else:
+                if model_config.model_impl != ModelImpl.TRANSFORMERS:
+                    return None
+                raise ValueError(
+                    f"Cannot find model module. {architecture!r} is not a "
+                    "registered model in the Transformers library (only "
+                    "relevant if the model is meant to be in Transformers) "
+                    "and 'AutoModel' is not present in the model config's "
+                    "'auto_map' (relevant if the model is custom).")
+        if not model_module.is_backend_compatible():
+            if model_config.model_impl != ModelImpl.TRANSFORMERS:
                return None
-            info = _try_inspect_model_cls(causal_lm_arch,
+            raise ValueError(
-                                          self.models[causal_lm_arch])
+                f"The Transformers implementation of {architecture!r} "
+                "is not compatible with vLLM.")
-            info = _ModelInfo(**dict(
+        return model_config._get_transformers_backend_cls()
-                asdict(info), **{
-                    "architecture": model_arch,
-                    "supports_cross_encoding": True
-                }))
-            return info
-        return None
+    def _normalize_arch(
+        self,
+        architecture: str,
+        model_config: ModelConfig,
+    ) -> str:
+        if architecture in self.models:
+            return architecture
+        # This may be called in order to resolve runner_type and convert_type
+        # in the first place, in which case we consider the default match
+        match = try_match_architecture_defaults(
+            architecture,
+            runner_type=getattr(model_config, "runner_type", None),
+            convert_type=getattr(model_config, "convert_type", None),
+        )
+        if match:
+            suffix, _ = match
+            # Get the name of the base model to convert
+            for repl_suffix, _ in iter_architecture_defaults():
+                base_arch = architecture.replace(suffix, repl_suffix)
+                if base_arch in self.models:
+                    return base_arch
+        return architecture
    def _normalize_archs(
        self,
-        architectures: Union[str, list[str]],
+        architectures: list[str],
+        model_config: ModelConfig,
    ) -> list[str]:
-        if isinstance(architectures, str):
-            architectures = [architectures]
        if not architectures:
            logger.warning("No model architectures are specified")
-        # filter out support architectures
+        return [
-        normalized_arch = list(
+            self._normalize_arch(arch, model_config) for arch in architectures
-            filter(lambda model: model in self.models, architectures))
+        ]
-        # try automatic conversion in adapters.py
-        for arch in architectures:
-            if not arch.endswith("ForSequenceClassification"):
-                continue
-            causal_lm_arch = arch.replace("ForSequenceClassification",
-                                          "ForCausalLM")
-            if causal_lm_arch in self.models:
-                normalized_arch.append(arch)
-        # NOTE(Isotr0py): Be careful of architectures' order!
-        # Make sure Transformers backend architecture is at the end of the
-        # list, otherwise pooling models automatic conversion will fail!
-        for arch in normalized_arch:
-            if arch.startswith("TransformersFor"):
-                normalized_arch.remove(arch)
-                normalized_arch.append(arch)
-        return normalized_arch
    def inspect_model_cls(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> tuple[_ModelInfo, str]:
-        architectures = self._normalize_archs(architectures)
+        if isinstance(architectures, str):
+            architectures = [architectures]
-        for arch in architectures:
+        normalized_archs = self._normalize_archs(architectures, model_config)
-            model_info = self._try_inspect_model_cls(arch)
+        # Require transformers impl
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
+            arch = self._try_resolve_transformers(architectures[0],
+                                                  model_config)
+            if arch is not None:
+                model_info = self._try_inspect_model_cls(arch)
+                if model_info is not None:
+                    return (model_info, arch)
+        for arch, normalized_arch in zip(architectures, normalized_archs):
+            model_info = self._try_inspect_model_cls(normalized_arch)
            if model_info is not None:
                return (model_info, arch)
+        # Fallback to transformers impl
+        if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
+            arch = self._try_resolve_transformers(architectures[0],
+                                                  model_config)
+            if arch is not None:
+                model_info = self._try_inspect_model_cls(arch)
+                if model_info is not None:
+                    return (model_info, arch)
        return self._raise_for_unsupported(architectures)
    def resolve_model_cls(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> tuple[type[nn.Module], str]:
+        """Find the first architecture whose model class can be loaded.
+
+        Resolution order:
+        1. If `model_config.model_impl` is `ModelImpl.TRANSFORMERS`, try the
+           architecture returned by `_try_resolve_transformers` first.
+        2. Try each normalized architecture in order.
+        3. For `ModelImpl.AUTO` or `ModelImpl.TRANSFORMERS`, try
+           `_try_resolve_transformers` as a fallback.
+        4. Otherwise defer to `_raise_for_unsupported`.
+
+        Returns:
+            The loaded model class and the architecture name it was matched
+            under.
+        """
-        architectures = self._normalize_archs(architectures)
+        # Accept a single architecture name as well as a list of names.
+        if isinstance(architectures, str):
+            architectures = [architectures]
-        for arch in architectures:
+        normalized_archs = self._normalize_archs(architectures, model_config)
-            model_cls = self._try_load_model_cls(arch)
+        # Require transformers impl
+        # Only the first listed architecture is considered here.
+        # NOTE(review): `architectures[0]` assumes a non-empty list - confirm
+        # that callers guarantee this.
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
+            arch = self._try_resolve_transformers(architectures[0],
+                                                  model_config)
+            if arch is not None:
+                model_cls = self._try_load_model_cls(arch)
+                if model_cls is not None:
+                    return (model_cls, arch)
+        # Load under the normalized name, but report the original name from
+        # `architectures` to the caller.
+        for arch, normalized_arch in zip(architectures, normalized_archs):
+            model_cls = self._try_load_model_cls(normalized_arch)
            if model_cls is not None:
                return (model_cls, arch)
+        # Fallback to transformers impl
+        # NOTE(review): for ModelImpl.TRANSFORMERS this repeats the attempt
+        # made at the top of the method - confirm whether that is intended.
+        if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
+            arch = self._try_resolve_transformers(architectures[0],
+                                                  model_config)
+            if arch is not None:
+                model_cls = self._try_load_model_cls(arch)
+                if model_cls is not None:
+                    return (model_cls, arch)
        return self._raise_for_unsupported(architectures)
    def is_text_generation_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `is_text_generation_model` flag of the model info
+        that `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.is_text_generation_model
    def is_pooling_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `is_pooling_model` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.is_pooling_model
    def is_cross_encoder_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_cross_encoding` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_cross_encoding
    def is_multimodal_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_multimodal` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_multimodal
    def supports_multimodal_raw_input(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_multimodal_raw_input` flag of the model info
+        that `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_multimodal_raw_input
    def is_pp_supported_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_pp` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_pp
    def model_has_inner_state(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `has_inner_state` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.has_inner_state
    def is_attention_free_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `is_attention_free` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.is_attention_free
    def is_hybrid_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `is_hybrid` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.is_hybrid
    def is_noops_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `has_noops` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.has_noops
    def is_transcription_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_transcription` flag of the model info that
+        `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_transcription
    def is_transcription_only_model(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return the `supports_transcription_only` flag of the model info
+        that `inspect_model_cls` resolves for `architectures`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return model_cls.supports_transcription_only
    def is_v1_compatible(
        self,
        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
    ) -> bool:
+        """Return True unless the model info that `inspect_model_cls`
+        resolves for `architectures` sets `supports_v0_only`."""
-        model_cls, _ = self.inspect_model_cls(architectures)
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
        return not model_cls.supports_v0_only

--- a/vllm/transformers_utils/dynamic_module.py
+++ b/vllm/transformers_utils/dynamic_module.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import Optional, Union
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+import vllm.envs as envs
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+def try_get_class_from_dynamic_module(
+    class_reference: str,
+    pretrained_model_name_or_path: str,
+    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    force_download: bool = False,
+    resume_download: Optional[bool] = None,
+    proxies: Optional[dict[str, str]] = None,
+    token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+    repo_type: Optional[str] = None,
+    code_revision: Optional[str] = None,
+    warn_on_fail: bool = True,
+    **kwargs,
+) -> Optional[type]:
+    """
+    As [transformers.dynamic_module_utils.get_class_from_dynamic_module][],
+    but ignoring any errors.
+
+    Every argument except `warn_on_fail` is forwarded unchanged to the
+    wrapped function.
+
+    Args:
+        warn_on_fail: If True, log a warning (with `exc_info=True`) when the
+            wrapped call raises.
+
+    Returns:
+        The class returned by the wrapped call, or `None` if it raised any
+        `Exception`.
+    """
+    try:
+        return get_class_from_dynamic_module(
+            class_reference,
+            pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            resume_download=resume_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            local_files_only=local_files_only,
+            repo_type=repo_type,
+            code_revision=code_revision,
+            **kwargs,
+        )
+    # Broad on purpose: per the docstring, every failure maps to `None`.
+    except Exception:
+        # The hub name is only used to make the warning message clearer.
+        location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
+        if warn_on_fail:
+            logger.warning(
+                "Unable to load %s from %s on %s.",
+                class_reference,
+                pretrained_model_name_or_path,
+                location,
+                exc_info=True,
+            )
+        return None
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -3,6 +3,8 @@
 from typing import Optional
+from typing_extensions import assert_never
 from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
@@ -108,6 +110,14 @@ class TokenizerGroup:
 def init_tokenizer_from_configs(model_config: ModelConfig,
                                scheduler_config: SchedulerConfig,
                                lora_config: Optional[LoRAConfig]):
+    """Build a `TokenizerGroup` from the given configs.
+
+    The tokenizer's `truncation_side` is derived from
+    `model_config.runner_type`: "left" for "generate" and "draft",
+    "right" for "pooling".
+    """
+    runner_type = model_config.runner_type
+    if runner_type == "generate" or runner_type == "draft":
+        truncation_side = "left"
+    elif runner_type == "pooling":
+        truncation_side = "right"
+    else:
+        # No other runner type is expected; `assert_never` flags any
+        # unhandled value.
+        assert_never(runner_type)
    return TokenizerGroup(
        tokenizer_id=model_config.tokenizer,
        enable_lora=bool(lora_config),
@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
        revision=model_config.tokenizer_revision,
-        truncation_side=model_config.truncation_side)
+        truncation_side=truncation_side)
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        self.is_multimodal_model = model_config.is_multimodal_model
        self.is_pooling_model = model_config.pooler_config is not None
        self.is_encoder_only_model = False
-        self.model_supports_multimodal_raw_input = (
+        self.is_multimodal_raw_input_supported = (
-            model_config.model_supports_multimodal_raw_input)
+            model_config.is_multimodal_raw_input_supported)
        self.max_model_len = model_config.max_model_len
        self.max_num_tokens = scheduler_config.max_num_batched_tokens
        self.max_num_reqs = scheduler_config.max_num_seqs
@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
    ) -> dict[str, Any]:
        model_kwargs: dict[str, Any] = {}
-        if self.model_supports_multimodal_raw_input:
+        if self.is_multimodal_raw_input_supported:
            # This model requires the raw multimodal data in input.
            if scheduler_output:
                multi_modal_kwargs_list = []