Unverified commit 86ae693f, authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
...@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" ...@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(method: str, k: int) -> EagleProposer: def _create_proposer(method: str, k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir, model_config = ModelConfig(model=model_dir,
task="generate", runner="generate",
max_model_len=100, max_model_len=100)
tokenizer=model_dir,
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
# Choose model directory based on method # Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
......
...@@ -44,14 +44,7 @@ def test_ngram_proposer(): ...@@ -44,14 +44,7 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len. # Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m", model_config = ModelConfig(model="facebook/opt-125m")
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
return NgramProposer( return NgramProposer(
vllm_config=VllmConfig(model_config=model_config, vllm_config=VllmConfig(model_config=model_config,
speculative_config=SpeculativeConfig. speculative_config=SpeculativeConfig.
......
...@@ -26,10 +26,6 @@ def get_vllm_config(): ...@@ -26,10 +26,6 @@ def get_vllm_config():
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16", # TPUs typically use bfloat16 dtype="bfloat16", # TPUs typically use bfloat16
seed=42, seed=42,
) )
......
...@@ -76,10 +76,6 @@ def get_vllm_config(): ...@@ -76,10 +76,6 @@ def get_vllm_config():
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16", dtype="float16",
seed=42, seed=42,
) )
......
...@@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, ...@@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from torch.distributed import ProcessGroup, ReduceOp from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self, runtime_checkable from typing_extensions import Self, assert_never, runtime_checkable
import vllm.envs as envs import vllm.envs as envs
from vllm import version from vllm import version
...@@ -102,12 +102,63 @@ RunnerOption = Literal["auto", "generate", "pooling", "draft"] ...@@ -102,12 +102,63 @@ RunnerOption = Literal["auto", "generate", "pooling", "draft"]
RunnerType = Literal["generate", "pooling", "draft"] RunnerType = Literal["generate", "pooling", "draft"]
_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = { ConvertOption = Literal["auto", "none", "embed", "classify", "reward"]
ConvertType = Literal["none", "embed", "classify", "reward"]
_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
"generate": ["generate", "transcription"], "generate": ["generate", "transcription"],
"pooling": ["encode", "embed", "classify", "reward"], "pooling": ["embedding", "embed", "classify", "score", "reward"],
"draft": ["draft"],
}
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
"generate": [],
"pooling": ["embed", "classify", "reward"],
"draft": [], "draft": [],
} }
# Some model suffixes are based on auto classes from Transformers:
# https://huggingface.co/docs/transformers/en/model_doc/auto
# NOTE: Items higher on this list take priority over lower ones
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
("ForCausalLM", ("generate", "none")),
("ForConditionalGeneration", ("generate", "none")),
("ChatModel", ("generate", "none")),
("LMHeadModel", ("generate", "none")),
("ForTextEncoding", ("pooling", "embed")),
("EmbeddingModel", ("pooling", "embed")),
("ForSequenceClassification", ("pooling", "classify")),
("ForAudioClassification", ("pooling", "classify")),
("ForImageClassification", ("pooling", "classify")),
("ForVideoClassification", ("pooling", "classify")),
("ClassificationModel", ("pooling", "classify")),
("ForRewardModeling", ("pooling", "reward")),
("RewardModel", ("pooling", "reward")),
# Let other `*Model`s take priority
("Model", ("pooling", "embed")),
]
def iter_architecture_defaults():
    """Yield the entries of ``_SUFFIX_TO_DEFAULTS`` in table order.

    Each entry is a ``(suffix, (runner_type, convert_type))`` tuple.
    """
    for entry in _SUFFIX_TO_DEFAULTS:
        yield entry
def try_match_architecture_defaults(
    architecture: str,
    *,
    runner_type: Optional[RunnerType] = None,
    convert_type: Optional[ConvertType] = None,
) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
    """Find the first suffix default that applies to ``architecture``.

    Entries from ``iter_architecture_defaults()`` are scanned in order. An
    entry applies when ``architecture`` ends with its suffix and, for each
    of ``runner_type`` / ``convert_type`` that is not ``None``, the entry's
    value equals the requested one.

    Returns:
        ``(suffix, (runner_type, convert_type))`` for the first applicable
        entry, or ``None`` when no entry applies.
    """
    for suffix, defaults in iter_architecture_defaults():
        candidate_runner, candidate_convert = defaults

        # Filters left as None place no constraint on the entry.
        if runner_type is not None and runner_type != candidate_runner:
            continue
        if convert_type is not None and convert_type != candidate_convert:
            continue
        if not architecture.endswith(suffix):
            continue

        return suffix, (candidate_runner, candidate_convert)

    return None
@runtime_checkable @runtime_checkable
class SupportsHash(Protocol): class SupportsHash(Protocol):
...@@ -236,11 +287,16 @@ class ModelConfig: ...@@ -236,11 +287,16 @@ class ModelConfig:
runner: RunnerOption = "auto" runner: RunnerOption = "auto"
"""The type of model runner to use. Each vLLM instance only supports one """The type of model runner to use. Each vLLM instance only supports one
model runner, even if the same model can be used for multiple types.""" model runner, even if the same model can be used for multiple types."""
task: TaskOption = "auto" convert: ConvertOption = "auto"
"""The task to use the model for. If the model supports more than one """Convert the model using adapters defined in
model runner, this is used to select which model runner to run. [vllm.model_executor.models.adapters][]. The most common use case is to
adapt a text generation model to be used for pooling tasks."""
Note that the model may support other tasks using the same model runner.""" task: Optional[TaskOption] = None
"""[DEPRECATED] The task to use the model for. If the model supports more
than one model runner, this is used to select which model runner to run.
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model """Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used.""" name or path will be used."""
...@@ -558,48 +614,103 @@ class ModelConfig: ...@@ -558,48 +614,103 @@ class ModelConfig:
self.hf_image_processor_config = get_hf_image_processor_config( self.hf_image_processor_config = get_hf_image_processor_config(
self.model, hf_token=self.hf_token, revision=self.revision) self.model, hf_token=self.hf_token, revision=self.revision)
# For pooling models, self.task is used to indicate the architectures = self.architectures
# user-selected task registry = self.registry
if self.task == "score": is_generative_model = registry.is_text_generation_model(
if self._is_classify_task(self.architectures): architectures, self)
self.task = "classify" is_pooling_model = registry.is_pooling_model(architectures, self)
def _task_to_convert(task: TaskOption) -> ConvertType:
    """Map a deprecated ``task`` value onto the matching ``convert`` value."""
    if task in ("embedding", "embed"):
        return "embed"
    if task == "classify":
        return "classify"
    if task == "reward":
        return "reward"
    if task != "score":
        # Every remaining task value maps to no conversion.
        return "none"

    # "score" follows the default pooling task of the architectures:
    # "classify" stays "classify", anything else becomes "embed".
    default_task = self._get_default_pooling_task(architectures)
    if default_task == "classify":
        return "classify"
    return "embed"
if self.task is not None:
runner: RunnerOption = "auto"
convert: ConvertOption = "auto"
msg_prefix = ("The 'task' option has been deprecated and will be "
"removed in v0.13.0 or v1.0, whichever comes first.")
msg_hint = "Please remove this option."
is_generative_task = self.task in _RUNNER_TASKS["generate"]
is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
if is_generative_model and is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = ("Please replace this option with `--runner "
"generate` to continue using this model "
"as a generative model.")
elif is_pooling_task:
runner = "pooling"
convert = "auto"
msg_hint = ("Please replace this option with `--runner "
"pooling` to continue using this model "
"as a pooling model.")
else: # task == "auto"
pass
elif is_generative_model or is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = "Please remove this option"
elif is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = ("Please replace this option with `--convert "
f"{convert}` to continue using this model "
"as a pooling model.")
else: # task == "auto"
pass
else: else:
self.task = "embed" raise AssertionError("The model should be a generative or "
elif self.task == "embedding": "pooling model when task is set to "
msg = ("The 'embedding' task has been renamed to 'embed', please " f"{self.task!r}.")
"use the new name. The old name will be removed in v1.0.")
self.runner = runner
self.convert = convert
msg = f"{msg_prefix} {msg_hint}"
warnings.warn(msg, DeprecationWarning, stacklevel=2) warnings.warn(msg, DeprecationWarning, stacklevel=2)
self.task = "embed" self.runner_type = self._get_runner_type(architectures, self.runner)
self.convert_type = self._get_convert_type(architectures,
self.runner_type,
self.convert)
if self.runner_type == "generate" and not is_generative_model:
generate_converts = _RUNNER_CONVERTS["generate"]
if self.convert_type not in generate_converts:
# Currently we don't have any converters for generative models
raise ValueError(
"This model does not support `--runner generate`.")
if self.runner_type == "pooling" and not is_pooling_model:
pooling_converts = _RUNNER_CONVERTS["pooling"]
if self.convert_type not in pooling_converts:
convert_option = "<" + "|".join(pooling_converts) + ">"
raise ValueError(
"This model does not support `--runner pooling`. "
f"You can pass `--convert {convert_option} to adapt "
"it into a pooling model.")
model_info, arch = self.registry.inspect_model_cls(self.architectures) self.supported_tasks = self._get_supported_tasks(
architectures, self.runner_type, self.convert_type)
# Note: Initialize these attributes early because transformers fallback
# may fail to load dynamic modules in child processes
model_info, arch = registry.inspect_model_cls(architectures, self)
self._model_info = model_info self._model_info = model_info
self._architecture = arch self._architecture = arch
logger.info("Resolved architecture: %s", arch)
all_supported_tasks = self._get_supported_tasks(self.task)
logger.debug("Tasks supported by runner type: %s", all_supported_tasks)
supported_runner_types = self._get_supported_runner_types(
all_supported_tasks)
runner_type = self._resolve_runner(self.runner, self.task,
supported_runner_types,
all_supported_tasks)
logger.debug("Selected runner type: %s", runner_type)
# For pooling models, self.task is used to indicate the
# user-selected task
if runner_type == "pooling" and self.task == "auto":
selected_task = all_supported_tasks[runner_type][-1]
assert selected_task != "encode"
self.task = selected_task
self.supported_runner_types = supported_runner_types
self.runner_type = runner_type
self.supported_tasks = all_supported_tasks[runner_type]
if self.runner_type in ("draft",
"generate") and self.task != "transcription":
self.truncation_side = "left"
else:
self.truncation_side = "right"
self.pooler_config = self._init_pooler_config() self.pooler_config = self._init_pooler_config()
...@@ -652,16 +763,10 @@ class ModelConfig: ...@@ -652,16 +763,10 @@ class ModelConfig:
self.original_max_model_len = self.max_model_len self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
self.multimodal_config = self._init_multimodal_config() self.multimodal_config = self._init_multimodal_config()
self.model_supports_multimodal_raw_input = (
self.registry.supports_multimodal_raw_input(self.architectures))
if not self.skip_tokenizer_init: if not self.skip_tokenizer_init:
self._verify_tokenizer_mode() self._verify_tokenizer_mode()
self.is_attention_free = self._init_attention_free()
self.is_hybrid = self._init_is_hybrid()
self.has_noops = self._init_has_noops()
self.has_inner_state = self._init_has_inner_state()
if (not current_platform.is_neuron() and self.override_neuron_config): if (not current_platform.is_neuron() and self.override_neuron_config):
raise ValueError( raise ValueError(
"`override_neuron_config` is only supported on Neuron.") "`override_neuron_config` is only supported on Neuron.")
...@@ -702,30 +807,13 @@ class ModelConfig: ...@@ -702,30 +807,13 @@ class ModelConfig:
@property @property
def architectures(self) -> list[str]: def architectures(self) -> list[str]:
# architectures in the model config. return getattr(self.hf_config, "architectures", [])
architectures = getattr(self.hf_config, "architectures", [])
# The registry assumes that it can always inspect the vLLM model class
# for a given architecture. This assumption breaks down for the
# Transformers backend, which may use a different class depending on
# the model type. To work around this, we add the correct Transformers
# backend class to the architectures list. We must do this here because
# we need access to the `hf_config` to determine the backend class.
transformers_backend_cls = self._get_transformers_backend_cls()
if (self.model_impl != ModelImpl.VLLM.value
and all(arch != transformers_backend_cls
for arch in architectures)):
architectures.append(transformers_backend_cls)
return architectures
@property @property
def architecture(self) -> str: def architecture(self) -> str:
# The architecture vllm actually used. """The architecture vllm actually used."""
return self._architecture return self._architecture
@property
def model_info(self):
return self._model_info
def maybe_pull_model_tokenizer_for_s3(self, model: str, def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None: tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed. """Pull model/tokenizer from S3 to temporary directory when needed.
...@@ -763,7 +851,7 @@ class ModelConfig: ...@@ -763,7 +851,7 @@ class ModelConfig:
self.tokenizer = s3_tokenizer.dir self.tokenizer = s3_tokenizer.dir
def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
if self.registry.is_multimodal_model(self.architectures): if self.registry.is_multimodal_model(self.architectures, self):
return MultiModalConfig( return MultiModalConfig(
limit_per_prompt=self.limit_mm_per_prompt, limit_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs, media_io_kwargs=self.media_io_kwargs,
...@@ -819,19 +907,6 @@ class ModelConfig: ...@@ -819,19 +907,6 @@ class ModelConfig:
return None return None
def _init_attention_free(self) -> bool:
return self.registry.is_attention_free_model(self.architectures)
def _init_is_hybrid(self) -> bool:
return self.registry.is_hybrid_model(self.architectures)
def _init_has_noops(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return self.registry.is_noops_model(architectures)
def _init_has_inner_state(self) -> bool:
return self.registry.model_has_inner_state(self.architectures)
def _verify_tokenizer_mode(self) -> None: def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
if tokenizer_mode not in get_args(TokenizerMode): if tokenizer_mode not in get_args(TokenizerMode):
...@@ -840,155 +915,168 @@ class ModelConfig: ...@@ -840,155 +915,168 @@ class ModelConfig:
f"one of {get_args(TokenizerMode)}.") f"one of {get_args(TokenizerMode)}.")
self.tokenizer_mode = tokenizer_mode self.tokenizer_mode = tokenizer_mode
def _is_classify_task(self, architectures: list[str]): def _get_default_runner_type(
self,
architectures: list[str],
) -> RunnerType:
registry = self.registry
# Some Sentence Transformers models use *ForCausalLM archs
if get_pooling_config(self.model, self.revision):
return "pooling"
for arch in architectures: for arch in architectures:
if arch.endswith("ForSequenceClassification"): if arch in registry.get_supported_archs():
return True if registry.is_pooling_model(architectures, self):
return self.registry.is_cross_encoder_model(architectures) return "pooling"
if registry.is_text_generation_model(architectures, self):
return "generate"
match = try_match_architecture_defaults(arch)
if match:
_, (runner_type, _) = match
return runner_type
return "generate"
def _get_preferred_pooling_task( def _get_runner_type(
self, self,
architectures: list[str], architectures: list[str],
) -> _ResolvedTask: runner: RunnerOption,
model_id = self.model ) -> RunnerType:
if get_pooling_config(model_id, self.revision): if runner != "auto":
return runner
runner_type = self._get_default_runner_type(architectures)
logger.info(
"Resolved `--runner auto` to `--runner %s`. "
"Pass the value explicitly to silence this message.", runner_type)
return runner_type
def _get_default_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
) -> ConvertType:
registry = self.registry
for arch in architectures:
if arch in registry.get_supported_archs():
if (runner_type == "generate"
and registry.is_text_generation_model(
architectures, self)):
return "none"
if (runner_type == "pooling"
and registry.is_pooling_model(architectures, self)):
return "none"
match = try_match_architecture_defaults(arch,
runner_type=runner_type)
if match:
_, (_, convert_type) = match
return convert_type
# This is to handle Sentence Transformers models that use *ForCausalLM
# and also multi-modal pooling models which are not defined as
# Sentence Transformers models
if runner_type == "pooling":
return "embed" return "embed"
if self.registry.is_transcription_model(architectures):
return "transcription"
suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [ return "none"
# Other models follow this pattern
("EmbeddingModel", "embed"), def _get_convert_type(
("RewardModel", "reward"), self,
] architectures: list[str],
runner_type: RunnerType,
convert: ConvertOption,
) -> ConvertType:
if convert != "auto":
return convert
for suffix, pref_task in suffix_to_preferred_task: convert_type = self._get_default_convert_type(architectures,
if self.architecture.endswith(suffix): runner_type)
return pref_task
return "embed" logger.info(
"Resolved `--convert auto` to `--convert %s`. "
"Pass the value explicitly to silence this message.", convert_type)
return convert_type
def _get_supported_generation_tasks( def _get_supported_generation_tasks(
self, self,
task_option: TaskOption, architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]: ) -> list[_ResolvedTask]:
registry = self.registry registry = self.registry
architectures = self.architectures
if registry.is_transcription_only_model(architectures): if registry.is_transcription_only_model(architectures, self):
return ["transcription"] return ["transcription"]
# TODO: Use get_supported_generation_tasks once V0 is removed
supported_tasks = list[_ResolvedTask]() supported_tasks = list[_ResolvedTask]()
if registry.is_text_generation_model(architectures): if (registry.is_text_generation_model(architectures, self)
or convert_type in _RUNNER_CONVERTS["generate"]):
supported_tasks.append("generate") supported_tasks.append("generate")
if registry.is_transcription_model(architectures): if registry.is_transcription_model(architectures, self):
supported_tasks.append("transcription") supported_tasks.append("transcription")
return supported_tasks return supported_tasks
def _get_default_pooling_task(
    self,
    architectures: list[str],
) -> Literal["embed", "classify", "reward"]:
    """Choose the fallback pooling task for ``architectures``.

    Returns ``"classify"`` when the registry reports a cross-encoder model.
    Otherwise returns the convert type of the first architecture that
    matches a pooling suffix default, or ``"embed"`` when none matches.
    """
    if self.registry.is_cross_encoder_model(architectures, self):
        return "classify"

    for arch in architectures:
        found = try_match_architecture_defaults(arch, runner_type="pooling")
        if not found:
            continue

        pooling_task = found[1][1]
        assert pooling_task != "none"
        return pooling_task

    return "embed"
def _get_supported_pooling_tasks( def _get_supported_pooling_tasks(
self, self,
task_option: TaskOption, architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]: ) -> list[_ResolvedTask]:
registry = self.registry registry = self.registry
architectures = self.architectures
# TODO: Use get_supported_pooling_tasks once V0 is removed
supported_tasks = list[_ResolvedTask]() supported_tasks = list[_ResolvedTask]()
if registry.is_pooling_model(architectures): if (registry.is_pooling_model(architectures, self)
or convert_type in _RUNNER_CONVERTS["pooling"]):
supported_tasks.append("encode") supported_tasks.append("encode")
# For now, users must specify the task (other than "pooling") extra_task = (self._get_default_pooling_task(architectures)
# to use for pooling models if convert_type == "none" else convert_type)
if task_option == "auto": supported_tasks.append(extra_task)
preferred_task = self._get_preferred_pooling_task(
architectures)
supported_tasks.append(preferred_task)
elif task_option in _RUNNER_TASKS["pooling"]:
supported_tasks.append(cast(_ResolvedTask, task_option))
return supported_tasks return supported_tasks
def _get_supported_tasks( def _get_supported_tasks(
self, self,
task_option: TaskOption, architectures: list[str],
) -> dict[RunnerType, list[_ResolvedTask]]: runner_type: RunnerType,
if self._is_classify_task(self.architectures): convert_type: ConvertType,
return {"generate": [], "pooling": ["classify"], "draft": []} ) -> list[_ResolvedTask]:
else: if runner_type == "generate":
return { return self._get_supported_generation_tasks(
"generate": self._get_supported_generation_tasks(task_option), architectures, convert_type)
"pooling": self._get_supported_pooling_tasks(task_option), if runner_type == "pooling":
"draft": ["draft"] return self._get_supported_pooling_tasks(architectures,
} convert_type)
if runner_type == "draft":
def _get_supported_runner_types( return ["draft"]
self,
supported_tasks: dict[RunnerType, list[_ResolvedTask]],
) -> set[RunnerType]:
return {
runner
for runner, runner_tasks in supported_tasks.items()
if len(runner_tasks) > 0
}
def _resolve_runner(
self,
runner_option: RunnerOption,
task_option: TaskOption,
supported_runner_types: set[RunnerType],
supported_tasks: dict[RunnerType, list[_ResolvedTask]],
) -> RunnerType:
if not supported_runner_types:
raise ValueError("This model does not support any model runners!")
if runner_option != "auto":
if runner_option not in supported_runner_types:
raise ValueError(
f"This model does not support runner={runner_option!r}. "
f"Available runners: {supported_runner_types}")
return runner_option
if task_option != "auto":
for runner, runner_tasks in supported_tasks.items():
if task_option in runner_tasks:
return runner
else:
task_runner: RunnerType = next(
runner for runner, tasks in _RUNNER_TASKS.items()
if task_option in tasks)
raise ValueError(
f"This model does not support task={task_option!r}. "
f"Available tasks for runner={task_runner!r}: "
f"{supported_tasks[task_runner]}")
if "classify" in supported_tasks.get("pooling", []):
# When multiple pooling tasks are present, default to
# pooling (eg cross-encoder) for non-standard architectures.
return "pooling"
suffix_to_preferred_runner: list[tuple[str, RunnerType]] = [
("ForCausalLM", "generate"),
("ForConditionalGeneration", "generate"),
("ChatModel", "generate"),
("LMHeadModel", "generate"),
("EmbeddingModel", "pooling"),
("RewardModel", "pooling"),
]
for suffix, pref_runner in suffix_to_preferred_runner:
if self.architecture.endswith(
suffix) and pref_runner in supported_runner_types:
return pref_runner
if "generate" in supported_runner_types:
return "generate"
if "pooling" in supported_runner_types:
return "pooling"
raise AssertionError("This line should not be reached") assert_never(runner_type)
def _parse_quant_hf_config(self): def _parse_quant_hf_config(self):
quant_cfg = getattr(self.hf_config, "quantization_config", None) quant_cfg = getattr(self.hf_config, "quantization_config", None)
...@@ -1216,7 +1304,8 @@ class ModelConfig: ...@@ -1216,7 +1304,8 @@ class ModelConfig:
pipeline_parallel_size = parallel_config.pipeline_parallel_size pipeline_parallel_size = parallel_config.pipeline_parallel_size
if pipeline_parallel_size > 1: if pipeline_parallel_size > 1:
if not self.registry.is_pp_supported_model(self.architectures): if not self.registry.is_pp_supported_model(self.architectures,
self):
raise NotImplementedError( raise NotImplementedError(
"Pipeline parallelism is not supported for this model. " "Pipeline parallelism is not supported for this model. "
"Supported models implement the `SupportsPP` interface.") "Supported models implement the `SupportsPP` interface.")
...@@ -1558,16 +1647,40 @@ class ModelConfig: ...@@ -1558,16 +1647,40 @@ class ModelConfig:
@property @property
def is_cross_encoder(self) -> bool: def is_cross_encoder(self) -> bool:
return self.task == "classify" return (self._model_info.supports_cross_encoding
or self.convert_type == "classify")
@property @property
def use_mla(self) -> bool: def is_pp_supported(self) -> bool:
return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE return self._model_info.supports_pp
@property
def is_multimodal_raw_input_supported(self) -> bool:
return self._model_info.supports_multimodal_raw_input
@property
def is_attention_free(self) -> bool:
return self._model_info.is_attention_free
@property
def is_hybrid(self) -> bool:
return self._model_info.is_hybrid
@property
def has_noops(self) -> bool:
return self._model_info.has_noops
@property
def has_inner_state(self):
return self._model_info.has_inner_state
@property @property
def is_v1_compatible(self) -> bool: def is_v1_compatible(self) -> bool:
architectures = getattr(self.hf_config, "architectures", []) return not self._model_info.supports_v0_only
return me_models.ModelRegistry.is_v1_compatible(architectures)
@property
def use_mla(self) -> bool:
return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE
@property @property
def is_matryoshka(self) -> bool: def is_matryoshka(self) -> bool:
...@@ -4769,7 +4882,10 @@ class VllmConfig: ...@@ -4769,7 +4882,10 @@ class VllmConfig:
self.scheduler_config.max_model_len = max_model_len self.scheduler_config.max_model_len = max_model_len
def try_verify_and_update_config(self): def try_verify_and_update_config(self):
architecture = getattr(self.model_config, "architecture", None) if self.model_config is None:
return
architecture = self.model_config.architecture
if architecture is None: if architecture is None:
return return
...@@ -4782,7 +4898,7 @@ class VllmConfig: ...@@ -4782,7 +4898,7 @@ class VllmConfig:
if self.model_config.is_hybrid: if self.model_config.is_hybrid:
HybridAttentionMambaModelConfig.verify_and_update_config(self) HybridAttentionMambaModelConfig.verify_and_update_config(self)
if self.model_config.task == "classify": if self.model_config.convert_type == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model. # Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import ( from vllm.model_executor.models.adapters import (
SequenceClassificationConfig) SequenceClassificationConfig)
......
...@@ -22,14 +22,15 @@ from typing_extensions import TypeIs ...@@ -22,14 +22,15 @@ from typing_extensions import TypeIs
import vllm.envs as envs import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig, ConfigFormat, ConfigType, ConvertOption,
DetailedTraceModules, Device, DeviceConfig, DecodingConfig, DetailedTraceModules, Device,
DistributedExecutorBackend, GuidedDecodingBackend, DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig, GuidedDecodingBackend, GuidedDecodingBackendV1,
KVTransferConfig, LoadConfig, LogprobsMode, HfOverrides, KVEventsConfig, KVTransferConfig,
LoRAConfig, ModelConfig, ModelDType, ModelImpl, LoadConfig, LogprobsMode, LoRAConfig, ModelConfig,
MultiModalConfig, ObservabilityConfig, ParallelConfig, ModelDType, ModelImpl, MultiModalConfig,
PoolerConfig, PrefixCachingHashAlgo, SchedulerConfig, ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
SchedulerPolicy, SpeculativeConfig, TaskOption, SchedulerPolicy, SpeculativeConfig, TaskOption,
TokenizerMode, VllmConfig, get_attr_docs, get_field) TokenizerMode, VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -270,7 +271,9 @@ class EngineArgs: ...@@ -270,7 +271,9 @@ class EngineArgs:
str, List[str]]] = ModelConfig.served_model_name str, List[str]]] = ModelConfig.served_model_name
tokenizer: Optional[str] = ModelConfig.tokenizer tokenizer: Optional[str] = ModelConfig.tokenizer
hf_config_path: Optional[str] = ModelConfig.hf_config_path hf_config_path: Optional[str] = ModelConfig.hf_config_path
task: TaskOption = ModelConfig.task runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: Optional[TaskOption] = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
...@@ -461,7 +464,11 @@ class EngineArgs: ...@@ -461,7 +464,11 @@ class EngineArgs:
) )
if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]): if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--task", **model_kwargs["task"]) model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task",
**model_kwargs["task"],
deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode", model_group.add_argument("--tokenizer-mode",
**model_kwargs["tokenizer_mode"]) **model_kwargs["tokenizer_mode"])
...@@ -870,6 +877,8 @@ class EngineArgs: ...@@ -870,6 +877,8 @@ class EngineArgs:
return ModelConfig( return ModelConfig(
model=self.model, model=self.model,
hf_config_path=self.hf_config_path, hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
task=self.task, task=self.task,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode, tokenizer_mode=self.tokenizer_mode,
......
...@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, ...@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
create_sort_beams_key_function) create_sort_beams_key_function)
from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
is_init_field) is_init_field)
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
TaskOption) PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
...@@ -170,7 +170,8 @@ class LLM: ...@@ -170,7 +170,8 @@ class LLM:
self, self,
model: str, model: str,
*, *,
task: TaskOption = "auto", runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer: Optional[str] = None, tokenizer: Optional[str] = None,
tokenizer_mode: TokenizerMode = "auto", tokenizer_mode: TokenizerMode = "auto",
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
...@@ -244,7 +245,8 @@ class LLM: ...@@ -244,7 +245,8 @@ class LLM:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, model=model,
task=task, runner=runner,
convert=convert,
tokenizer=tokenizer, tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init, skip_tokenizer_init=skip_tokenizer_init,
...@@ -459,18 +461,10 @@ class LLM: ...@@ -459,18 +461,10 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "generate": if runner_type != "generate":
messages = [ raise ValueError(
"LLM.generate() is only supported for generative models." "LLM.generate() is only supported for generative models. "
] "Try passing `--runner generate` to use the model as a "
"generative model.")
if "generate" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'generate' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task generate` or "
"`--task transcription`.")
raise ValueError(" ".join(messages))
if prompt_token_ids is not None: if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs( parsed_prompts = self._convert_v1_inputs(
...@@ -497,7 +491,8 @@ class LLM: ...@@ -497,7 +491,8 @@ class LLM:
truncate_prompt_tokens = None truncate_prompt_tokens = None
if isinstance(sampling_params, SamplingParams): if isinstance(sampling_params, SamplingParams):
truncate_prompt_tokens = sampling_params.truncate_prompt_tokens truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs) truncate_prompt_tokens, tokenization_kwargs)
# Add any modality specific loras to the corresponding prompts # Add any modality specific loras to the corresponding prompts
...@@ -1100,16 +1095,10 @@ class LLM: ...@@ -1100,16 +1095,10 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "pooling": if runner_type != "pooling":
messages = ["LLM.encode() is only supported for pooling models."] raise ValueError(
"LLM.encode() is only supported for pooling models. "
if "pooling" in model_config.supported_runner_types: "Try passing `--runner pooling` to use the model as a "
messages.append( "pooling model.")
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
if prompt_token_ids is not None: if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs( parsed_prompts = self._convert_v1_inputs(
...@@ -1183,8 +1172,9 @@ class LLM: ...@@ -1183,8 +1172,9 @@ class LLM:
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
if "embed" not in self.supported_tasks: if "embed" not in self.supported_tasks:
raise ValueError("Embedding API is not supported by this model. " raise ValueError(
"Please set `--task embed`.") "Embedding API is not supported by this model. "
"Try converting the model using `--convert embed`.")
items = self.encode( items = self.encode(
prompts, prompts,
...@@ -1229,7 +1219,7 @@ class LLM: ...@@ -1229,7 +1219,7 @@ class LLM:
if "classify" not in self.supported_tasks: if "classify" not in self.supported_tasks:
raise ValueError( raise ValueError(
"Classification API is not supported by this model. " "Classification API is not supported by this model. "
"Please set `--task classify`.") "Try converting the model using `--convert classify`.")
items = self.encode( items = self.encode(
prompts, prompts,
...@@ -1283,27 +1273,26 @@ class LLM: ...@@ -1283,27 +1273,26 @@ class LLM:
use_tqdm: Union[bool, Callable[..., tqdm]] = True, use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
model_config = self.llm_engine.model_config
if isinstance(tokenizer, MistralTokenizer): if isinstance(tokenizer, MistralTokenizer):
raise ValueError( raise ValueError(
"Score API is only enabled for `--task embed or score`") "Score API is not supported for Mistral tokenizer")
if len(data_1) == 1: if len(data_1) == 1:
data_1 = data_1 * len(data_2) data_1 = data_1 * len(data_2)
pooling_params = PoolingParams(task="score") pooling_params = PoolingParams(task="score")
tokenization_kwargs: dict[str, Any] = {} tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs) truncate_prompt_tokens, tokenization_kwargs)
parsed_prompts = [] parsed_prompts = []
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
if self.llm_engine.model_config.is_multimodal_model: if model_config.is_multimodal_model:
model_config = self.llm_engine.model_config
for q, d in input_pairs: for q, d in input_pairs:
_, engine_prompt = get_score_prompt( _, engine_prompt = get_score_prompt(
model_config=model_config, model_config=model_config,
...@@ -1314,11 +1303,9 @@ class LLM: ...@@ -1314,11 +1303,9 @@ class LLM:
) )
parsed_prompts.append(engine_prompt) parsed_prompts.append(engine_prompt)
else: else:
for q, t in input_pairs: for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token: if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token. # cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer( prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type] text=q, # type: ignore[arg-type]
...@@ -1396,23 +1383,18 @@ class LLM: ...@@ -1396,23 +1383,18 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "pooling": if runner_type != "pooling":
messages = ["LLM.score() is only supported for pooling models."] raise ValueError(
"LLM.score() is only supported for pooling models. "
if "pooling" in model_config.supported_runner_types: "Try passing `--runner pooling` to use the model as a "
messages.append( "pooling model.")
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
supported_tasks = self.supported_tasks supported_tasks = self.supported_tasks
if all(t not in supported_tasks for t in ("embed", "classify")): if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. " raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.") "Try converting the model using "
"`--convert embed` or `--convert classify`.")
if (model_config.task == "classify" if (model_config.is_cross_encoder
and getattr(model_config.hf_config, "num_labels", 0) != 1): and getattr(model_config.hf_config, "num_labels", 0) != 1):
raise ValueError("Score API is only enabled for num_labels == 1.") raise ValueError("Score API is only enabled for num_labels == 1.")
...@@ -1421,15 +1403,14 @@ class LLM: ...@@ -1421,15 +1403,14 @@ class LLM:
# lists of tokens to the `text` and `text_pair` kwargs # lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
if not self.llm_engine.model_config.is_multimodal_model: if not model_config.is_multimodal_model:
def check_data_type(data: Union[SingletonPrompt, def check_data_type(data: Union[SingletonPrompt,
Sequence[SingletonPrompt], Sequence[SingletonPrompt],
ScoreMultiModalParam]): ScoreMultiModalParam]):
if isinstance(data, dict) and "content" in data: if isinstance(data, dict) and "content" in data:
raise ValueError( raise ValueError("ScoreMultiModalParam is not supported "
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501 f"for {model_config.architecture}")
)
check_data_type(data_1) check_data_type(data_1)
check_data_type(data_2) check_data_type(data_2)
...@@ -1471,7 +1452,7 @@ class LLM: ...@@ -1471,7 +1452,7 @@ class LLM:
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
if self.llm_engine.model_config.is_cross_encoder: if model_config.is_cross_encoder:
return self._cross_encoding_score( return self._cross_encoding_score(
tokenizer, tokenizer,
data_1, # type: ignore[arg-type] data_1, # type: ignore[arg-type]
......
...@@ -1734,7 +1734,6 @@ async def init_app_state( ...@@ -1734,7 +1734,6 @@ async def init_app_state(
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if "transcription" in supported_tasks else None ) if "transcription" in supported_tasks else None
state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0 state.server_load_metrics = 0
......
...@@ -9,9 +9,8 @@ from dataclasses import dataclass, field ...@@ -9,9 +9,8 @@ from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import torch import torch
import transformers
from torch import nn from torch import nn
from transformers.dynamic_module_utils import get_class_from_dynamic_module from typing_extensions import assert_never
from vllm.attention import Attention from vllm.attention import Attention
from vllm.config import (ModelConfig, ModelImpl, VllmConfig, from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
...@@ -20,13 +19,10 @@ from vllm.logger import init_logger ...@@ -20,13 +19,10 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase) QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.adapters import (as_embedding_model, from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model, as_reward_model,
as_seq_cls_model) as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available from vllm.utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module, ...@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module,
# New parameters or parameters already on target device are untouched # New parameters or parameters already on target device are untouched
def resolve_transformers_arch(model_config: ModelConfig,
architectures: list[str]):
if model_config.model_impl == ModelImpl.VLLM:
raise ValueError(
"Attempting to resolve architecture from the Transformers library "
"but the model implementation is set to vLLM. This should never "
"happen.")
for i, arch in enumerate(architectures):
if arch in _TRANSFORMERS_BACKEND_MODELS:
continue
if model_config.model_impl == ModelImpl.AUTO:
logger.warning(
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
auto_modules = {
name:
get_class_from_dynamic_module(module,
model_config.model,
revision=model_config.revision)
for name, module in sorted(auto_map.items(), key=lambda x: x[0])
}
model_module = getattr(transformers, arch, None)
if model_module is None:
if "AutoModel" not in auto_map:
raise ValueError(
f"Cannot find model module. '{arch}' is not a registered "
"model in the Transformers library (only relevant if the "
"model is meant to be in Transformers) and 'AutoModel' is "
"not present in the model config's 'auto_map' (relevant "
"if the model is custom).")
model_module = auto_modules["AutoModel"]
if not model_module.is_backend_compatible():
raise ValueError(
f"The Transformers implementation of '{arch}' is not "
"compatible with vLLM.")
architectures[i] = model_config._get_transformers_backend_cls()
return architectures
def get_model_architecture( def get_model_architecture(
model_config: ModelConfig) -> tuple[type[nn.Module], str]: model_config: ModelConfig) -> tuple[type[nn.Module], str]:
architectures = getattr(model_config.hf_config, "architectures", []) architectures = getattr(model_config.hf_config, "architectures", [])
...@@ -239,56 +180,38 @@ def get_model_architecture( ...@@ -239,56 +180,38 @@ def get_model_architecture(
"bitsandbytes", "bitsandbytes",
] ]
vllm_supported_archs = ModelRegistry.get_supported_archs() if (model_config.quantization is not None
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in and model_config.quantization not in mixtral_supported
_TRANSFORMERS_BACKEND_MODELS) and "MixtralForCausalLM" in architectures):
vllm_not_supported = not any(is_supported(arch) for arch in architectures) architectures = ["QuantMixtralForCausalLM"]
if vllm_not_supported:
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
assert model_config.task == "classify" model_cls, arch = model_config.registry.resolve_model_cls(
causal_lm_arch = arch.replace("ForSequenceClassification", architectures,
"ForCausalLM") model_config=model_config,
causal_lm_arch_vllm_supported = (causal_lm_arch )
in vllm_supported_archs)
if not causal_lm_arch_vllm_supported:
continue
architectures = [causal_lm_arch] if arch == model_config._get_transformers_backend_cls():
vllm_not_supported = False assert model_config.model_impl != ModelImpl.VLLM
break if model_config.model_impl == ModelImpl.AUTO:
logger.warning_once(
if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures): "%s has no vLLM implementation, falling back to Transformers "
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]] "implementation. Some features may not be supported and "
raise ValueError( "performance may not be optimal.", arch)
f"Model architecture {architectures[0]} was supported"
f" in vLLM until version {previous_version}, and is "
"not supported anymore. Please use an older version"
" of vLLM if you want to use this model architecture.")
if (model_config.model_impl == ModelImpl.TRANSFORMERS or
model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
architectures = resolve_transformers_arch(model_config, architectures)
logger.debug_once("Resolve transformers arch %s", str(architectures))
elif (model_config.quantization is not None
and model_config.quantization not in mixtral_supported
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
model_cls, arch = ModelRegistry.resolve_model_cls(architectures) convert_type = model_config.convert_type
if model_config.task == "embed": if convert_type == "none":
logger.debug_once("Automatic conversion using `as_embedding_model`.") pass
elif convert_type == "embed":
logger.debug_once("Converting to embedding model.")
model_cls = as_embedding_model(model_cls) model_cls = as_embedding_model(model_cls)
elif model_config.task == "classify": elif convert_type == "classify":
logger.debug_once("Automatic conversion using `as_seq_cls_model`.") logger.debug_once("Converting to sequence classification model.")
model_cls = as_seq_cls_model(model_cls) model_cls = as_seq_cls_model(model_cls)
elif model_config.task == "reward": elif convert_type == "reward":
logger.debug_once("Automatic conversion using `as_reward_model`.") logger.debug_once("Converting to reward model.")
model_cls = as_reward_model(model_cls) model_cls = as_reward_model(model_cls)
else:
assert_never(convert_type)
return model_cls, arch return model_cls, arch
......
...@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): ...@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
dtype=kv_cache_dtype, dtype=kv_cache_dtype,
use_mla=model_config.use_mla).page_size_bytes use_mla=model_config.use_mla).page_size_bytes
model_cls = ModelRegistry.resolve_model_cls( model_cls, _ = ModelRegistry.resolve_model_cls(
model_config._model_info.architecture)[0] model_config.architecture,
model_config=model_config,
)
# get mamba page size # get mamba page size
mamba_page_size = MambaSpec( mamba_page_size = MambaSpec(
......
...@@ -12,19 +12,24 @@ import sys ...@@ -12,19 +12,24 @@ import sys
import tempfile import tempfile
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Set from collections.abc import Set
from dataclasses import asdict, dataclass, field from dataclasses import dataclass, field
from functools import lru_cache from functools import lru_cache
from typing import Callable, Optional, TypeVar, Union from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn import torch.nn as nn
import transformers
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
try_match_architecture_defaults)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.dynamic_module import (
try_get_class_from_dynamic_module)
from .interfaces import (has_inner_state, has_noops, is_attention_free, from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding, is_hybrid, supports_cross_encoding,
supports_multimodal, supports_multimodal_raw_input, supports_multimodal, supports_multimodal_raw_input,
supports_pp, supports_transcription, supports_v0_only) supports_pp, supports_transcription, supports_v0_only)
from .interfaces_base import is_text_generation_model from .interfaces_base import is_pooling_model, is_text_generation_model
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -311,7 +316,7 @@ class _ModelInfo: ...@@ -311,7 +316,7 @@ class _ModelInfo:
return _ModelInfo( return _ModelInfo(
architecture=model.__name__, architecture=model.__name__,
is_text_generation_model=is_text_generation_model(model), is_text_generation_model=is_text_generation_model(model),
is_pooling_model=True, # Can convert any model into a pooling model is_pooling_model=is_pooling_model(model),
supports_cross_encoding=supports_cross_encoding(model), supports_cross_encoding=supports_cross_encoding(model),
supports_multimodal=supports_multimodal(model), supports_multimodal=supports_multimodal(model),
supports_multimodal_raw_input=supports_multimodal_raw_input(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model),
...@@ -465,6 +470,16 @@ class _ModelRegistry: ...@@ -465,6 +470,16 @@ class _ModelRegistry:
f"Model architectures {architectures} failed " f"Model architectures {architectures} failed "
"to be inspected. Please check the logs for more details.") "to be inspected. Please check the logs for more details.")
for arch in architectures:
if arch in _PREVIOUSLY_SUPPORTED_MODELS:
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[arch]
raise ValueError(
f"Model architecture {arch} was supported in vLLM until "
f"v{previous_version}, and is not supported anymore. "
"Please use an older version of vLLM if you want to "
"use this model architecture.")
raise ValueError( raise ValueError(
f"Model architectures {architectures} are not supported for now. " f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {all_supported_archs}") f"Supported architectures: {all_supported_archs}")
...@@ -477,174 +492,284 @@ class _ModelRegistry: ...@@ -477,174 +492,284 @@ class _ModelRegistry:
return _try_load_model_cls(model_arch, self.models[model_arch]) return _try_load_model_cls(model_arch, self.models[model_arch])
def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
if model_arch in self.models: if model_arch not in self.models:
return _try_inspect_model_cls(model_arch, self.models[model_arch]) return None
if model_arch.endswith("ForSequenceClassification"): return _try_inspect_model_cls(model_arch, self.models[model_arch])
causal_lm_arch = model_arch.replace("ForSequenceClassification",
"ForCausalLM") def _try_resolve_transformers(
if causal_lm_arch not in self.models: self,
architecture: str,
model_config: ModelConfig,
) -> Optional[str]:
if architecture in _TRANSFORMERS_BACKEND_MODELS:
return architecture
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=False,
)
model_module = getattr(transformers, architecture, None)
if model_module is None:
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=True,
)
if model_module is not None:
break
else:
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"Cannot find model module. {architecture!r} is not a "
"registered model in the Transformers library (only "
"relevant if the model is meant to be in Transformers) "
"and 'AutoModel' is not present in the model config's "
"'auto_map' (relevant if the model is custom).")
if not model_module.is_backend_compatible():
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None return None
info = _try_inspect_model_cls(causal_lm_arch, raise ValueError(
self.models[causal_lm_arch]) f"The Transformers implementation of {architecture!r} "
"is not compatible with vLLM.")
info = _ModelInfo(**dict( return model_config._get_transformers_backend_cls()
asdict(info), **{
"architecture": model_arch,
"supports_cross_encoding": True
}))
return info
return None def _normalize_arch(
self,
architecture: str,
model_config: ModelConfig,
) -> str:
if architecture in self.models:
return architecture
# This may be called in order to resolve runner_type and convert_type
# in the first place, in which case we consider the default match
match = try_match_architecture_defaults(
architecture,
runner_type=getattr(model_config, "runner_type", None),
convert_type=getattr(model_config, "convert_type", None),
)
if match:
suffix, _ = match
# Get the name of the base model to convert
for repl_suffix, _ in iter_architecture_defaults():
base_arch = architecture.replace(suffix, repl_suffix)
if base_arch in self.models:
return base_arch
return architecture
def _normalize_archs( def _normalize_archs(
self, self,
architectures: Union[str, list[str]], architectures: list[str],
model_config: ModelConfig,
) -> list[str]: ) -> list[str]:
if isinstance(architectures, str):
architectures = [architectures]
if not architectures: if not architectures:
logger.warning("No model architectures are specified") logger.warning("No model architectures are specified")
# filter out support architectures return [
normalized_arch = list( self._normalize_arch(arch, model_config) for arch in architectures
filter(lambda model: model in self.models, architectures)) ]
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch in self.models:
normalized_arch.append(arch)
# NOTE(Isotr0py): Be careful of architectures' order!
# Make sure Transformers backend architecture is at the end of the
# list, otherwise pooling models automatic conversion will fail!
for arch in normalized_arch:
if arch.startswith("TransformersFor"):
normalized_arch.remove(arch)
normalized_arch.append(arch)
return normalized_arch
def inspect_model_cls( def inspect_model_cls(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[_ModelInfo, str]: ) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures) if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures: normalized_archs = self._normalize_archs(architectures, model_config)
model_info = self._try_inspect_model_cls(arch)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_info = self._try_inspect_model_cls(normalized_arch)
if model_info is not None: if model_info is not None:
return (model_info, arch) return (model_info, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
return self._raise_for_unsupported(architectures) return self._raise_for_unsupported(architectures)
def resolve_model_cls( def resolve_model_cls(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[type[nn.Module], str]: ) -> tuple[type[nn.Module], str]:
architectures = self._normalize_archs(architectures) if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures: normalized_archs = self._normalize_archs(architectures, model_config)
model_cls = self._try_load_model_cls(arch)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_cls = self._try_load_model_cls(normalized_arch)
if model_cls is not None: if model_cls is not None:
return (model_cls, arch) return (model_cls, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
return self._raise_for_unsupported(architectures) return self._raise_for_unsupported(architectures)
def is_text_generation_model( def is_text_generation_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_text_generation_model return model_cls.is_text_generation_model
def is_pooling_model( def is_pooling_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_pooling_model return model_cls.is_pooling_model
def is_cross_encoder_model( def is_cross_encoder_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_cross_encoding return model_cls.supports_cross_encoding
def is_multimodal_model( def is_multimodal_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal return model_cls.supports_multimodal
def supports_multimodal_raw_input( def supports_multimodal_raw_input(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal_raw_input return model_cls.supports_multimodal_raw_input
def is_pp_supported_model( def is_pp_supported_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_pp return model_cls.supports_pp
def model_has_inner_state( def model_has_inner_state(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_inner_state return model_cls.has_inner_state
def is_attention_free_model( def is_attention_free_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_attention_free return model_cls.is_attention_free
def is_hybrid_model( def is_hybrid_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_hybrid return model_cls.is_hybrid
def is_noops_model( def is_noops_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_noops return model_cls.has_noops
def is_transcription_model( def is_transcription_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription return model_cls.supports_transcription
def is_transcription_only_model( def is_transcription_only_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription_only return model_cls.supports_transcription_only
def is_v1_compatible( def is_v1_compatible(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return not model_cls.supports_v0_only return not model_cls.supports_v0_only
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional, Union
from transformers.dynamic_module_utils import get_class_from_dynamic_module
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[type]:
    """
    Best-effort variant of
    [transformers.dynamic_module_utils.get_class_from_dynamic_module][].

    Every argument except `warn_on_fail` is forwarded unchanged to the
    wrapped function. If that call raises any `Exception`, the error is
    swallowed and `None` is returned instead; when `warn_on_fail` is true,
    a warning that includes the traceback is logged first.
    """
    # Gather the explicitly named options once so the call below stays short.
    loader_options = {
        "cache_dir": cache_dir,
        "force_download": force_download,
        "resume_download": resume_download,
        "proxies": proxies,
        "token": token,
        "revision": revision,
        "local_files_only": local_files_only,
        "repo_type": repo_type,
        "code_revision": code_revision,
    }

    try:
        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            **loader_options,
            **kwargs,
        )
    except Exception:
        # Label the hub in the log message according to the ModelScope
        # environment switch.
        if envs.VLLM_USE_MODELSCOPE:
            hub_name = "ModelScope"
        else:
            hub_name = "HF Hub"

        if not warn_on_fail:
            return None

        # exc_info=True attaches the active exception's traceback, so this
        # call must stay inside the `except` block.
        logger.warning(
            "Unable to load %s from %s on %s.",
            class_reference,
            pretrained_model_name_or_path,
            hub_name,
            exc_info=True,
        )
        return None
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
from typing import Optional from typing import Optional
from typing_extensions import assert_never
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
...@@ -108,6 +110,14 @@ class TokenizerGroup: ...@@ -108,6 +110,14 @@ class TokenizerGroup:
def init_tokenizer_from_configs(model_config: ModelConfig, def init_tokenizer_from_configs(model_config: ModelConfig,
scheduler_config: SchedulerConfig, scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig]): lora_config: Optional[LoRAConfig]):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)
return TokenizerGroup( return TokenizerGroup(
tokenizer_id=model_config.tokenizer, tokenizer_id=model_config.tokenizer,
enable_lora=bool(lora_config), enable_lora=bool(lora_config),
...@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig, ...@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
tokenizer_mode=model_config.tokenizer_mode, tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision, revision=model_config.tokenizer_revision,
truncation_side=model_config.truncation_side) truncation_side=truncation_side)
...@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.is_multimodal_model = model_config.is_multimodal_model self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None self.is_pooling_model = model_config.pooler_config is not None
self.is_encoder_only_model = False self.is_encoder_only_model = False
self.model_supports_multimodal_raw_input = ( self.is_multimodal_raw_input_supported = (
model_config.model_supports_multimodal_raw_input) model_config.is_multimodal_raw_input_supported)
self.max_model_len = model_config.max_model_len self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs self.max_num_reqs = scheduler_config.max_num_seqs
...@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) -> dict[str, Any]: ) -> dict[str, Any]:
model_kwargs: dict[str, Any] = {} model_kwargs: dict[str, Any] = {}
if self.model_supports_multimodal_raw_input: if self.is_multimodal_raw_input_supported:
# This model requires the raw multimodal data in input. # This model requires the raw multimodal data in input.
if scheduler_output: if scheduler_output:
multi_modal_kwargs_list = [] multi_modal_kwargs_list = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment