Unverified commit 86ae693f, authored by Cyrus Leung and committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
......@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(method: str, k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
task="generate",
max_model_len=100,
tokenizer=model_dir,
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
runner="generate",
max_model_len=100)
# Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
......
......@@ -44,14 +44,7 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m",
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
model_config = ModelConfig(model="facebook/opt-125m")
return NgramProposer(
vllm_config=VllmConfig(model_config=model_config,
speculative_config=SpeculativeConfig.
......
......@@ -26,10 +26,6 @@ def get_vllm_config():
)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16", # TPUs typically use bfloat16
seed=42,
)
......
......@@ -76,10 +76,6 @@ def get_vllm_config():
)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
seed=42,
)
......
......@@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self, runtime_checkable
from typing_extensions import Self, assert_never, runtime_checkable
import vllm.envs as envs
from vllm import version
......@@ -102,12 +102,63 @@ RunnerOption = Literal["auto", "generate", "pooling", "draft"]
RunnerType = Literal["generate", "pooling", "draft"]
_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = {
ConvertOption = Literal["auto", "none", "embed", "classify", "reward"]
ConvertType = Literal["none", "embed", "classify", "reward"]
_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
"generate": ["generate", "transcription"],
"pooling": ["encode", "embed", "classify", "reward"],
"pooling": ["embedding", "embed", "classify", "score", "reward"],
"draft": ["draft"],
}
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
"generate": [],
"pooling": ["embed", "classify", "reward"],
"draft": [],
}
# Default `(runner_type, convert_type)` pair for architecture names that end
# with the given suffix.
# Some model suffixes are based on auto classes from Transformers:
# https://huggingface.co/docs/transformers/en/model_doc/auto
# NOTE: Items higher on this list take priority over lower ones, because
# lookups walk the list in order and stop at the first matching suffix.
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
("ForCausalLM", ("generate", "none")),
("ForConditionalGeneration", ("generate", "none")),
("ChatModel", ("generate", "none")),
("LMHeadModel", ("generate", "none")),
("ForTextEncoding", ("pooling", "embed")),
("EmbeddingModel", ("pooling", "embed")),
("ForSequenceClassification", ("pooling", "classify")),
("ForAudioClassification", ("pooling", "classify")),
("ForImageClassification", ("pooling", "classify")),
("ForVideoClassification", ("pooling", "classify")),
("ClassificationModel", ("pooling", "classify")),
("ForRewardModeling", ("pooling", "reward")),
("RewardModel", ("pooling", "reward")),
# Let other `*Model`s take priority: this generic suffix is kept last so
# that the more specific `...Model` suffixes above are tried first.
("Model", ("pooling", "embed")),
]
def iter_architecture_defaults():
    """Yield every `(suffix, (runner_type, convert_type))` default entry.

    Entries are produced in the order they appear in `_SUFFIX_TO_DEFAULTS`.
    """
    for entry in _SUFFIX_TO_DEFAULTS:
        yield entry
def try_match_architecture_defaults(
    architecture: str,
    *,
    runner_type: Optional[RunnerType] = None,
    convert_type: Optional[ConvertType] = None,
) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
    """Return the first suffix default that applies to `architecture`.

    Args:
        architecture: Architecture name to test against the known suffixes.
        runner_type: When given, only entries with this runner type qualify.
        convert_type: When given, only entries with this convert type
            qualify.

    Returns:
        `(suffix, (runner_type, convert_type))` for the first qualifying
        entry whose suffix ends `architecture`, or `None` if there is none.
    """
    for suffix, defaults in iter_architecture_defaults():
        default_runner, default_convert = defaults

        # Apply the optional filters first, then the suffix test.
        if runner_type is not None and runner_type != default_runner:
            continue
        if convert_type is not None and convert_type != default_convert:
            continue
        if not architecture.endswith(suffix):
            continue

        return suffix, (default_runner, default_convert)

    return None
@runtime_checkable
class SupportsHash(Protocol):
......@@ -236,11 +287,16 @@ class ModelConfig:
runner: RunnerOption = "auto"
"""The type of model runner to use. Each vLLM instance only supports one
model runner, even if the same model can be used for multiple types."""
task: TaskOption = "auto"
"""The task to use the model for. If the model supports more than one
model runner, this is used to select which model runner to run.
Note that the model may support other tasks using the same model runner."""
convert: ConvertOption = "auto"
"""Convert the model using adapters defined in
[vllm.model_executor.models.adapters][]. The most common use case is to
adapt a text generation model to be used for pooling tasks."""
task: Optional[TaskOption] = None
"""[DEPRECATED] The task to use the model for. If the model supports more
than one model runner, this is used to select which model runner to run.
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
......@@ -558,48 +614,103 @@ class ModelConfig:
self.hf_image_processor_config = get_hf_image_processor_config(
self.model, hf_token=self.hf_token, revision=self.revision)
# For pooling models, self.task is used to indicate the
# user-selected task
if self.task == "score":
if self._is_classify_task(self.architectures):
self.task = "classify"
architectures = self.architectures
registry = self.registry
is_generative_model = registry.is_text_generation_model(
architectures, self)
is_pooling_model = registry.is_pooling_model(architectures, self)
def _task_to_convert(task: TaskOption) -> ConvertType:
if task == "embedding" or task == "embed":
return "embed"
if task == "classify":
return "classify"
if task == "reward":
return "reward"
if task == "score":
new_task = self._get_default_pooling_task(architectures)
return "classify" if new_task == "classify" else "embed"
return "none"
if self.task is not None:
runner: RunnerOption = "auto"
convert: ConvertOption = "auto"
msg_prefix = ("The 'task' option has been deprecated and will be "
"removed in v0.13.0 or v1.0, whichever comes first.")
msg_hint = "Please remove this option."
is_generative_task = self.task in _RUNNER_TASKS["generate"]
is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
if is_generative_model and is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = ("Please replace this option with `--runner "
"generate` to continue using this model "
"as a generative model.")
elif is_pooling_task:
runner = "pooling"
convert = "auto"
msg_hint = ("Please replace this option with `--runner "
"pooling` to continue using this model "
"as a pooling model.")
else: # task == "auto"
pass
elif is_generative_model or is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = "Please remove this option"
elif is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = ("Please replace this option with `--convert "
f"{convert}` to continue using this model "
"as a pooling model.")
else: # task == "auto"
pass
else:
self.task = "embed"
elif self.task == "embedding":
msg = ("The 'embedding' task has been renamed to 'embed', please "
"use the new name. The old name will be removed in v1.0.")
raise AssertionError("The model should be a generative or "
"pooling model when task is set to "
f"{self.task!r}.")
self.runner = runner
self.convert = convert
msg = f"{msg_prefix} {msg_hint}"
warnings.warn(msg, DeprecationWarning, stacklevel=2)
self.task = "embed"
self.runner_type = self._get_runner_type(architectures, self.runner)
self.convert_type = self._get_convert_type(architectures,
self.runner_type,
self.convert)
if self.runner_type == "generate" and not is_generative_model:
generate_converts = _RUNNER_CONVERTS["generate"]
if self.convert_type not in generate_converts:
# Currently we don't have any converters for generative models
raise ValueError(
"This model does not support `--runner generate`.")
if self.runner_type == "pooling" and not is_pooling_model:
pooling_converts = _RUNNER_CONVERTS["pooling"]
if self.convert_type not in pooling_converts:
convert_option = "<" + "|".join(pooling_converts) + ">"
raise ValueError(
"This model does not support `--runner pooling`. "
f"You can pass `--convert {convert_option} to adapt "
"it into a pooling model.")
model_info, arch = self.registry.inspect_model_cls(self.architectures)
self.supported_tasks = self._get_supported_tasks(
architectures, self.runner_type, self.convert_type)
# Note: Initialize these attributes early because transformers fallback
# may fail to load dynamic modules in child processes
model_info, arch = registry.inspect_model_cls(architectures, self)
self._model_info = model_info
self._architecture = arch
all_supported_tasks = self._get_supported_tasks(self.task)
logger.debug("Tasks supported by runner type: %s", all_supported_tasks)
supported_runner_types = self._get_supported_runner_types(
all_supported_tasks)
runner_type = self._resolve_runner(self.runner, self.task,
supported_runner_types,
all_supported_tasks)
logger.debug("Selected runner type: %s", runner_type)
# For pooling models, self.task is used to indicate the
# user-selected task
if runner_type == "pooling" and self.task == "auto":
selected_task = all_supported_tasks[runner_type][-1]
assert selected_task != "encode"
self.task = selected_task
self.supported_runner_types = supported_runner_types
self.runner_type = runner_type
self.supported_tasks = all_supported_tasks[runner_type]
if self.runner_type in ("draft",
"generate") and self.task != "transcription":
self.truncation_side = "left"
else:
self.truncation_side = "right"
logger.info("Resolved architecture: %s", arch)
self.pooler_config = self._init_pooler_config()
......@@ -652,16 +763,10 @@ class ModelConfig:
self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
self.multimodal_config = self._init_multimodal_config()
self.model_supports_multimodal_raw_input = (
self.registry.supports_multimodal_raw_input(self.architectures))
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self.is_attention_free = self._init_attention_free()
self.is_hybrid = self._init_is_hybrid()
self.has_noops = self._init_has_noops()
self.has_inner_state = self._init_has_inner_state()
if (not current_platform.is_neuron() and self.override_neuron_config):
raise ValueError(
"`override_neuron_config` is only supported on Neuron.")
......@@ -702,30 +807,13 @@ class ModelConfig:
@property
def architectures(self) -> list[str]:
# architectures in the model config.
architectures = getattr(self.hf_config, "architectures", [])
# The registry assumes that it can always inspect the vLLM model class
# for a given architecture. This assumption breaks down for the
# Transformers backend, which may use a different class depending on
# the model type. To work around this, we add the correct Transformers
# backend class to the architectures list. We must do this here because
# we need access to the `hf_config` to determine the backend class.
transformers_backend_cls = self._get_transformers_backend_cls()
if (self.model_impl != ModelImpl.VLLM.value
and all(arch != transformers_backend_cls
for arch in architectures)):
architectures.append(transformers_backend_cls)
return architectures
return getattr(self.hf_config, "architectures", [])
@property
def architecture(self) -> str:
# The architecture vllm actually used.
"""The architecture vllm actually used."""
return self._architecture
@property
def model_info(self):
return self._model_info
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed.
......@@ -763,7 +851,7 @@ class ModelConfig:
self.tokenizer = s3_tokenizer.dir
def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
if self.registry.is_multimodal_model(self.architectures):
if self.registry.is_multimodal_model(self.architectures, self):
return MultiModalConfig(
limit_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs,
......@@ -819,19 +907,6 @@ class ModelConfig:
return None
def _init_attention_free(self) -> bool:
return self.registry.is_attention_free_model(self.architectures)
def _init_is_hybrid(self) -> bool:
return self.registry.is_hybrid_model(self.architectures)
def _init_has_noops(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return self.registry.is_noops_model(architectures)
def _init_has_inner_state(self) -> bool:
return self.registry.model_has_inner_state(self.architectures)
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
if tokenizer_mode not in get_args(TokenizerMode):
......@@ -840,155 +915,168 @@ class ModelConfig:
f"one of {get_args(TokenizerMode)}.")
self.tokenizer_mode = tokenizer_mode
def _is_classify_task(self, architectures: list[str]):
def _get_default_runner_type(
self,
architectures: list[str],
) -> RunnerType:
registry = self.registry
# Some Sentence Transformers models use *ForCausalLM archs
if get_pooling_config(self.model, self.revision):
return "pooling"
for arch in architectures:
if arch.endswith("ForSequenceClassification"):
return True
return self.registry.is_cross_encoder_model(architectures)
if arch in registry.get_supported_archs():
if registry.is_pooling_model(architectures, self):
return "pooling"
if registry.is_text_generation_model(architectures, self):
return "generate"
def _get_preferred_pooling_task(
match = try_match_architecture_defaults(arch)
if match:
_, (runner_type, _) = match
return runner_type
return "generate"
def _get_runner_type(
self,
architectures: list[str],
) -> _ResolvedTask:
model_id = self.model
if get_pooling_config(model_id, self.revision):
return "embed"
if self.registry.is_transcription_model(architectures):
return "transcription"
runner: RunnerOption,
) -> RunnerType:
if runner != "auto":
return runner
suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [
# Other models follow this pattern
("EmbeddingModel", "embed"),
("RewardModel", "reward"),
]
runner_type = self._get_default_runner_type(architectures)
logger.info(
"Resolved `--runner auto` to `--runner %s`. "
"Pass the value explicitly to silence this message.", runner_type)
for suffix, pref_task in suffix_to_preferred_task:
if self.architecture.endswith(suffix):
return pref_task
return runner_type
# Pick the convert type to use when no explicit convert option was given.
#
# Returns "none" when an architecture is in the registry's supported list and
# the registry reports the model as already matching `runner_type`
# (text generation for "generate", pooling for "pooling"). Otherwise the
# suffix defaults for `runner_type` are consulted, and finally the fallback is
# "embed" for the pooling runner and "none" for every other runner.
#
# NOTE(review): indentation is not preserved in this view, so whether the
# suffix lookup runs for every architecture or only for registry-supported
# ones cannot be confirmed here -- check against the real file.
def _get_default_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
) -> ConvertType:
registry = self.registry
for arch in architectures:
if arch in registry.get_supported_archs():
# The model already serves this runner natively: no adapter needed.
if (runner_type == "generate"
and registry.is_text_generation_model(
architectures, self)):
return "none"
if (runner_type == "pooling"
and registry.is_pooling_model(architectures, self)):
return "none"
# Fall back to the suffix table, restricted to the requested runner.
match = try_match_architecture_defaults(arch,
runner_type=runner_type)
if match:
_, (_, convert_type) = match
return convert_type
# This is to handle Sentence Transformers models that use *ForCausalLM
# and also multi-modal pooling models which are not defined as
# Sentence Transformers models
if runner_type == "pooling":
return "embed"
return "none"
def _get_convert_type(
    self,
    architectures: list[str],
    runner_type: RunnerType,
    convert: ConvertOption,
) -> ConvertType:
    """Resolve the `convert` option into a concrete convert type.

    An explicit value is returned unchanged. `"auto"` is replaced by the
    default for the given architectures and runner type, and the resolved
    value is logged so the user can pass it explicitly next time.
    """
    if convert == "auto":
        resolved = self._get_default_convert_type(architectures, runner_type)
        logger.info(
            "Resolved `--convert auto` to `--convert %s`. "
            "Pass the value explicitly to silence this message.", resolved)
        return resolved

    return convert
def _get_supported_generation_tasks(
self,
task_option: TaskOption,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
architectures = self.architectures
if registry.is_transcription_only_model(architectures):
if registry.is_transcription_only_model(architectures, self):
return ["transcription"]
# TODO: Use get_supported_generation_tasks once V0 is removed
supported_tasks = list[_ResolvedTask]()
if registry.is_text_generation_model(architectures):
if (registry.is_text_generation_model(architectures, self)
or convert_type in _RUNNER_CONVERTS["generate"]):
supported_tasks.append("generate")
if registry.is_transcription_model(architectures):
if registry.is_transcription_model(architectures, self):
supported_tasks.append("transcription")
return supported_tasks
def _get_default_pooling_task(
    self,
    architectures: list[str],
) -> Literal["embed", "classify", "reward"]:
    """Choose the default pooling task for the given architectures.

    Cross-encoder models (as reported by the registry) default to
    `"classify"`. Otherwise the first architecture with a pooling suffix
    default decides the task, and `"embed"` is used when nothing matches.
    """
    if self.registry.is_cross_encoder_model(architectures, self):
        return "classify"

    # Lazy generator: architectures are still probed one at a time, in order.
    suffix_matches = (try_match_architecture_defaults(arch,
                                                      runner_type="pooling")
                      for arch in architectures)
    for found in suffix_matches:
        if not found:
            continue
        _, (_, pooling_task) = found
        assert pooling_task != "none"
        return pooling_task

    return "embed"
def _get_supported_pooling_tasks(
self,
task_option: TaskOption,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
architectures = self.architectures
# TODO: Use get_supported_pooling_tasks once V0 is removed
supported_tasks = list[_ResolvedTask]()
if registry.is_pooling_model(architectures):
if (registry.is_pooling_model(architectures, self)
or convert_type in _RUNNER_CONVERTS["pooling"]):
supported_tasks.append("encode")
# For now, users must specify the task (other than "pooling")
# to use for pooling models
if task_option == "auto":
preferred_task = self._get_preferred_pooling_task(
architectures)
supported_tasks.append(preferred_task)
elif task_option in _RUNNER_TASKS["pooling"]:
supported_tasks.append(cast(_ResolvedTask, task_option))
extra_task = (self._get_default_pooling_task(architectures)
if convert_type == "none" else convert_type)
supported_tasks.append(extra_task)
return supported_tasks
def _get_supported_tasks(
self,
task_option: TaskOption,
) -> dict[RunnerType, list[_ResolvedTask]]:
if self._is_classify_task(self.architectures):
return {"generate": [], "pooling": ["classify"], "draft": []}
else:
return {
"generate": self._get_supported_generation_tasks(task_option),
"pooling": self._get_supported_pooling_tasks(task_option),
"draft": ["draft"]
}
def _get_supported_runner_types(
self,
supported_tasks: dict[RunnerType, list[_ResolvedTask]],
) -> set[RunnerType]:
return {
runner
for runner, runner_tasks in supported_tasks.items()
if len(runner_tasks) > 0
}
def _resolve_runner(
self,
runner_option: RunnerOption,
task_option: TaskOption,
supported_runner_types: set[RunnerType],
supported_tasks: dict[RunnerType, list[_ResolvedTask]],
) -> RunnerType:
if not supported_runner_types:
raise ValueError("This model does not support any model runners!")
if runner_option != "auto":
if runner_option not in supported_runner_types:
raise ValueError(
f"This model does not support runner={runner_option!r}. "
f"Available runners: {supported_runner_types}")
return runner_option
if task_option != "auto":
for runner, runner_tasks in supported_tasks.items():
if task_option in runner_tasks:
return runner
else:
task_runner: RunnerType = next(
runner for runner, tasks in _RUNNER_TASKS.items()
if task_option in tasks)
raise ValueError(
f"This model does not support task={task_option!r}. "
f"Available tasks for runner={task_runner!r}: "
f"{supported_tasks[task_runner]}")
if "classify" in supported_tasks.get("pooling", []):
# When multiple pooling tasks are present, default to
# pooling (eg cross-encoder) for non-standard architectures.
return "pooling"
suffix_to_preferred_runner: list[tuple[str, RunnerType]] = [
("ForCausalLM", "generate"),
("ForConditionalGeneration", "generate"),
("ChatModel", "generate"),
("LMHeadModel", "generate"),
("EmbeddingModel", "pooling"),
("RewardModel", "pooling"),
]
for suffix, pref_runner in suffix_to_preferred_runner:
if self.architecture.endswith(
suffix) and pref_runner in supported_runner_types:
return pref_runner
if "generate" in supported_runner_types:
return "generate"
if "pooling" in supported_runner_types:
return "pooling"
architectures: list[str],
runner_type: RunnerType,
convert_type: ConvertType,
) -> list[_ResolvedTask]:
if runner_type == "generate":
return self._get_supported_generation_tasks(
architectures, convert_type)
if runner_type == "pooling":
return self._get_supported_pooling_tasks(architectures,
convert_type)
if runner_type == "draft":
return ["draft"]
raise AssertionError("This line should not be reached")
assert_never(runner_type)
def _parse_quant_hf_config(self):
quant_cfg = getattr(self.hf_config, "quantization_config", None)
......@@ -1216,7 +1304,8 @@ class ModelConfig:
pipeline_parallel_size = parallel_config.pipeline_parallel_size
if pipeline_parallel_size > 1:
if not self.registry.is_pp_supported_model(self.architectures):
if not self.registry.is_pp_supported_model(self.architectures,
self):
raise NotImplementedError(
"Pipeline parallelism is not supported for this model. "
"Supported models implement the `SupportsPP` interface.")
......@@ -1558,16 +1647,40 @@ class ModelConfig:
@property
def is_cross_encoder(self) -> bool:
return self.task == "classify"
return (self._model_info.supports_cross_encoding
or self.convert_type == "classify")
@property
def use_mla(self) -> bool:
return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE
def is_pp_supported(self) -> bool:
return self._model_info.supports_pp
@property
def is_multimodal_raw_input_supported(self) -> bool:
    """Return the `supports_multimodal_raw_input` flag of the model info."""
    model_info = self._model_info
    return model_info.supports_multimodal_raw_input
@property
def is_attention_free(self) -> bool:
    """Return the `is_attention_free` flag of the model info."""
    model_info = self._model_info
    return model_info.is_attention_free
@property
def is_hybrid(self) -> bool:
    """Return the `is_hybrid` flag of the model info."""
    model_info = self._model_info
    return model_info.is_hybrid
@property
def has_noops(self) -> bool:
    """Return the `has_noops` flag of the model info."""
    model_info = self._model_info
    return model_info.has_noops
@property
def has_inner_state(self) -> bool:
    """Return the `has_inner_state` flag of the model info.

    The return annotation matches the sibling model-info properties
    (`is_attention_free`, `is_hybrid`, `has_noops`), which are all `bool`.
    """
    return self._model_info.has_inner_state
@property
def is_v1_compatible(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return me_models.ModelRegistry.is_v1_compatible(architectures)
return not self._model_info.supports_v0_only
@property
def use_mla(self) -> bool:
    """Whether MLA is used: a DeepSeek MLA model with MLA not disabled.

    MLA is disabled through the `VLLM_MLA_DISABLE` setting in `envs`.
    """
    deepseek_mla = self.is_deepseek_mla
    if not deepseek_mla:
        # Mirror `and` semantics: hand back the falsy operand itself.
        return deepseek_mla
    return not envs.VLLM_MLA_DISABLE
@property
def is_matryoshka(self) -> bool:
......@@ -4769,7 +4882,10 @@ class VllmConfig:
self.scheduler_config.max_model_len = max_model_len
def try_verify_and_update_config(self):
architecture = getattr(self.model_config, "architecture", None)
if self.model_config is None:
return
architecture = self.model_config.architecture
if architecture is None:
return
......@@ -4782,7 +4898,7 @@ class VllmConfig:
if self.model_config.is_hybrid:
HybridAttentionMambaModelConfig.verify_and_update_config(self)
if self.model_config.task == "classify":
if self.model_config.convert_type == "classify":
# Maybe convert ForCausalLM into ForSequenceClassification model.
from vllm.model_executor.models.adapters import (
SequenceClassificationConfig)
......
......@@ -22,14 +22,15 @@ from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig,
DetailedTraceModules, Device, DeviceConfig,
DistributedExecutorBackend, GuidedDecodingBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
KVTransferConfig, LoadConfig, LogprobsMode,
LoRAConfig, ModelConfig, ModelDType, ModelImpl,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
PoolerConfig, PrefixCachingHashAlgo, SchedulerConfig,
ConfigFormat, ConfigType, ConvertOption,
DecodingConfig, DetailedTraceModules, Device,
DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackend, GuidedDecodingBackendV1,
HfOverrides, KVEventsConfig, KVTransferConfig,
LoadConfig, LogprobsMode, LoRAConfig, ModelConfig,
ModelDType, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
SchedulerPolicy, SpeculativeConfig, TaskOption,
TokenizerMode, VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger
......@@ -270,7 +271,9 @@ class EngineArgs:
str, List[str]]] = ModelConfig.served_model_name
tokenizer: Optional[str] = ModelConfig.tokenizer
hf_config_path: Optional[str] = ModelConfig.hf_config_path
task: TaskOption = ModelConfig.task
runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: Optional[TaskOption] = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
......@@ -461,7 +464,11 @@ class EngineArgs:
)
if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--task", **model_kwargs["task"])
model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task",
**model_kwargs["task"],
deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode",
**model_kwargs["tokenizer_mode"])
......@@ -870,6 +877,8 @@ class EngineArgs:
return ModelConfig(
model=self.model,
hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
task=self.task,
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
......
......@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
create_sort_beams_key_function)
from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
is_init_field)
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
......@@ -170,7 +170,8 @@ class LLM:
self,
model: str,
*,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer: Optional[str] = None,
tokenizer_mode: TokenizerMode = "auto",
skip_tokenizer_init: bool = False,
......@@ -244,7 +245,8 @@ class LLM:
engine_args = EngineArgs(
model=model,
task=task,
runner=runner,
convert=convert,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
......@@ -459,18 +461,10 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "generate":
messages = [
"LLM.generate() is only supported for generative models."
]
if "generate" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'generate' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task generate` or "
"`--task transcription`.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.generate() is only supported for generative models. "
"Try passing `--runner generate` to use the model as a "
"generative model.")
if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs(
......@@ -497,7 +491,8 @@ class LLM:
truncate_prompt_tokens = None
if isinstance(sampling_params, SamplingParams):
truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs)
# Add any modality specific loras to the corresponding prompts
......@@ -1100,16 +1095,10 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
messages = ["LLM.encode() is only supported for pooling models."]
if "pooling" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.encode() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"pooling model.")
if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs(
......@@ -1183,8 +1172,9 @@ class LLM:
embedding vectors in the same order as the input prompts.
"""
if "embed" not in self.supported_tasks:
raise ValueError("Embedding API is not supported by this model. "
"Please set `--task embed`.")
raise ValueError(
"Embedding API is not supported by this model. "
"Try converting the model using `--convert embed`.")
items = self.encode(
prompts,
......@@ -1229,7 +1219,7 @@ class LLM:
if "classify" not in self.supported_tasks:
raise ValueError(
"Classification API is not supported by this model. "
"Please set `--task classify`.")
"Try converting the model using `--convert classify`.")
items = self.encode(
prompts,
......@@ -1283,27 +1273,26 @@ class LLM:
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
model_config = self.llm_engine.model_config
if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
"Score API is only enabled for `--task embed or score`")
"Score API is not supported for Mistral tokenizer")
if len(data_1) == 1:
data_1 = data_1 * len(data_2)
pooling_params = PoolingParams(task="score")
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs)
parsed_prompts = []
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
if self.llm_engine.model_config.is_multimodal_model:
model_config = self.llm_engine.model_config
if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
......@@ -1314,11 +1303,9 @@ class LLM:
)
parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
......@@ -1396,23 +1383,18 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
messages = ["LLM.score() is only supported for pooling models."]
if "pooling" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.score() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"pooling model.")
supported_tasks = self.supported_tasks
if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.")
"Try converting the model using "
"`--convert embed` or `--convert classify`.")
if (model_config.task == "classify"
if (model_config.is_cross_encoder
and getattr(model_config.hf_config, "num_labels", 0) != 1):
raise ValueError("Score API is only enabled for num_labels == 1.")
......@@ -1421,15 +1403,14 @@ class LLM:
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer()
if not self.llm_engine.model_config.is_multimodal_model:
if not model_config.is_multimodal_model:
def check_data_type(data: Union[SingletonPrompt,
Sequence[SingletonPrompt],
ScoreMultiModalParam]):
if isinstance(data, dict) and "content" in data:
raise ValueError(
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501
)
raise ValueError("ScoreMultiModalParam is not supported "
f"for {model_config.architecture}")
check_data_type(data_1)
check_data_type(data_2)
......@@ -1471,7 +1452,7 @@ class LLM:
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
if self.llm_engine.model_config.is_cross_encoder:
if model_config.is_cross_encoder:
return self._cross_encoding_score(
tokenizer,
data_1, # type: ignore[arg-type]
......
......@@ -1734,7 +1734,6 @@ async def init_app_state(
state.openai_serving_models,
request_logger=request_logger,
) if "transcription" in supported_tasks else None
state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0
......
......@@ -9,9 +9,8 @@ from dataclasses import dataclass, field
from typing import Optional
import torch
import transformers
from torch import nn
from transformers.dynamic_module_utils import get_class_from_dynamic_module
from typing_extensions import assert_never
from vllm.attention import Attention
from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
......@@ -20,13 +19,10 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
......@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module,
# New parameters or parameters already on target device are untouched
def resolve_transformers_arch(model_config: ModelConfig,
                              architectures: list[str]):
    """Rewrite architecture names so they point at the Transformers backend.

    Entries of ``architectures`` that are already listed in
    ``_TRANSFORMERS_BACKEND_MODELS`` are left untouched. Every other entry is
    validated and then overwritten **in place** with the value returned by
    ``model_config._get_transformers_backend_cls()``.

    Args:
        model_config: Configuration providing ``model_impl``, ``hf_config``,
            ``model`` and ``revision``.
        architectures: Architecture names to resolve; mutated in place.

    Returns:
        The same ``architectures`` list that was passed in.

    Raises:
        ValueError: If ``model_impl`` is ``ModelImpl.VLLM``, if no model
            class can be located for an architecture, or if the located
            class reports that it is not backend compatible.
    """
    if model_config.model_impl == ModelImpl.VLLM:
        raise ValueError(
            "Attempting to resolve architecture from the Transformers library "
            "but the model implementation is set to vLLM. This should never "
            "happen.")

    for position, arch in enumerate(architectures):
        # Already a Transformers backend architecture: nothing to resolve.
        if arch in _TRANSFORMERS_BACKEND_MODELS:
            continue

        if model_config.model_impl == ModelImpl.AUTO:
            logger.warning(
                "%s has no vLLM implementation, falling back to Transformers "
                "implementation. Some features may not be supported and "
                "performance may not be optimal.", arch)

        auto_map: dict[str, str] = (getattr(model_config.hf_config,
                                            "auto_map", None) or {})

        # Load the dynamic modules in sorted key order so that the config
        # class ("AutoConfig") is initialized before any model class
        # ("AutoModel", "AutoModelFor<Task>"); otherwise the model class
        # would not be able to access the config class.
        auto_modules = {}
        for name in sorted(auto_map):
            auto_modules[name] = get_class_from_dynamic_module(
                auto_map[name],
                model_config.model,
                revision=model_config.revision,
            )

        model_module = getattr(transformers, arch, None)
        if model_module is None:
            # Not shipped with Transformers: fall back to the custom class
            # declared through the config's auto_map.
            if "AutoModel" in auto_map:
                model_module = auto_modules["AutoModel"]
            else:
                raise ValueError(
                    f"Cannot find model module. '{arch}' is not a registered "
                    "model in the Transformers library (only relevant if the "
                    "model is meant to be in Transformers) and 'AutoModel' is "
                    "not present in the model config's 'auto_map' (relevant "
                    "if the model is custom).")

        if not model_module.is_backend_compatible():
            raise ValueError(
                f"The Transformers implementation of '{arch}' is not "
                "compatible with vLLM.")

        architectures[position] = (
            model_config._get_transformers_backend_cls())

    return architectures
def get_model_architecture(
model_config: ModelConfig) -> tuple[type[nn.Module], str]:
architectures = getattr(model_config.hf_config, "architectures", [])
......@@ -239,56 +180,38 @@ def get_model_architecture(
"bitsandbytes",
]
vllm_supported_archs = ModelRegistry.get_supported_archs()
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
_TRANSFORMERS_BACKEND_MODELS)
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
if vllm_not_supported:
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
assert model_config.task == "classify"
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
causal_lm_arch_vllm_supported = (causal_lm_arch
in vllm_supported_archs)
if not causal_lm_arch_vllm_supported:
continue
architectures = [causal_lm_arch]
vllm_not_supported = False
break
if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures):
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]]
raise ValueError(
f"Model architecture {architectures[0]} was supported"
f" in vLLM until version {previous_version}, and is "
"not supported anymore. Please use an older version"
" of vLLM if you want to use this model architecture.")
if (model_config.model_impl == ModelImpl.TRANSFORMERS or
model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
architectures = resolve_transformers_arch(model_config, architectures)
logger.debug_once("Resolve transformers arch %s", str(architectures))
elif (model_config.quantization is not None
if (model_config.quantization is not None
and model_config.quantization not in mixtral_supported
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
if model_config.task == "embed":
logger.debug_once("Automatic conversion using `as_embedding_model`.")
model_cls, arch = model_config.registry.resolve_model_cls(
architectures,
model_config=model_config,
)
if arch == model_config._get_transformers_backend_cls():
assert model_config.model_impl != ModelImpl.VLLM
if model_config.model_impl == ModelImpl.AUTO:
logger.warning_once(
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
convert_type = model_config.convert_type
if convert_type == "none":
pass
elif convert_type == "embed":
logger.debug_once("Converting to embedding model.")
model_cls = as_embedding_model(model_cls)
elif model_config.task == "classify":
logger.debug_once("Automatic conversion using `as_seq_cls_model`.")
elif convert_type == "classify":
logger.debug_once("Converting to sequence classification model.")
model_cls = as_seq_cls_model(model_cls)
elif model_config.task == "reward":
logger.debug_once("Automatic conversion using `as_reward_model`.")
elif convert_type == "reward":
logger.debug_once("Converting to reward model.")
model_cls = as_reward_model(model_cls)
else:
assert_never(convert_type)
return model_cls, arch
......
......@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
dtype=kv_cache_dtype,
use_mla=model_config.use_mla).page_size_bytes
model_cls = ModelRegistry.resolve_model_cls(
model_config._model_info.architecture)[0]
model_cls, _ = ModelRegistry.resolve_model_cls(
model_config.architecture,
model_config=model_config,
)
# get mamba page size
mamba_page_size = MambaSpec(
......
......@@ -12,19 +12,24 @@ import sys
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Set
from dataclasses import asdict, dataclass, field
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn
import transformers
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
try_match_architecture_defaults)
from vllm.logger import init_logger
from vllm.transformers_utils.dynamic_module import (
try_get_class_from_dynamic_module)
from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding,
supports_multimodal, supports_multimodal_raw_input,
supports_pp, supports_transcription, supports_v0_only)
from .interfaces_base import is_text_generation_model
from .interfaces_base import is_pooling_model, is_text_generation_model
logger = init_logger(__name__)
......@@ -311,7 +316,7 @@ class _ModelInfo:
return _ModelInfo(
architecture=model.__name__,
is_text_generation_model=is_text_generation_model(model),
is_pooling_model=True, # Can convert any model into a pooling model
is_pooling_model=is_pooling_model(model),
supports_cross_encoding=supports_cross_encoding(model),
supports_multimodal=supports_multimodal(model),
supports_multimodal_raw_input=supports_multimodal_raw_input(model),
......@@ -465,6 +470,16 @@ class _ModelRegistry:
f"Model architectures {architectures} failed "
"to be inspected. Please check the logs for more details.")
for arch in architectures:
if arch in _PREVIOUSLY_SUPPORTED_MODELS:
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[arch]
raise ValueError(
f"Model architecture {arch} was supported in vLLM until "
f"v{previous_version}, and is not supported anymore. "
"Please use an older version of vLLM if you want to "
"use this model architecture.")
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {all_supported_archs}")
......@@ -477,66 +492,141 @@ class _ModelRegistry:
return _try_load_model_cls(model_arch, self.models[model_arch])
def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
if model_arch in self.models:
if model_arch not in self.models:
return None
return _try_inspect_model_cls(model_arch, self.models[model_arch])
if model_arch.endswith("ForSequenceClassification"):
causal_lm_arch = model_arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch not in self.models:
return None
def _try_resolve_transformers(
self,
architecture: str,
model_config: ModelConfig,
) -> Optional[str]:
if architecture in _TRANSFORMERS_BACKEND_MODELS:
return architecture
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=False,
)
info = _try_inspect_model_cls(causal_lm_arch,
self.models[causal_lm_arch])
model_module = getattr(transformers, architecture, None)
info = _ModelInfo(**dict(
asdict(info), **{
"architecture": model_arch,
"supports_cross_encoding": True
}))
return info
if model_module is None:
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=True,
)
if model_module is not None:
break
else:
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"Cannot find model module. {architecture!r} is not a "
"registered model in the Transformers library (only "
"relevant if the model is meant to be in Transformers) "
"and 'AutoModel' is not present in the model config's "
"'auto_map' (relevant if the model is custom).")
if not model_module.is_backend_compatible():
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"The Transformers implementation of {architecture!r} "
"is not compatible with vLLM.")
return model_config._get_transformers_backend_cls()
def _normalize_arch(
    self,
    architecture: str,
    model_config: ModelConfig,
) -> str:
    """Return the registered architecture name to use for ``architecture``.

    A name that is already a key of ``self.models`` is returned as-is.
    Otherwise the suffix reported by ``try_match_architecture_defaults`` is
    swapped for each suffix yielded by ``iter_architecture_defaults`` and
    the first resulting name found in ``self.models`` is returned. When
    nothing matches, ``architecture`` is returned unchanged.
    """
    if architecture in self.models:
        return architecture

    # The config may not carry runner_type / convert_type yet, because this
    # method can be called while those values are themselves being
    # resolved; in that case the default match is considered.
    defaults_match = try_match_architecture_defaults(
        architecture,
        runner_type=getattr(model_config, "runner_type", None),
        convert_type=getattr(model_config, "convert_type", None),
    )
    if not defaults_match:
        return architecture

    matched_suffix, _ = defaults_match

    # Candidate names of the base model to convert, in the order in which
    # the known suffixes are yielded.
    candidates = (architecture.replace(matched_suffix, other_suffix)
                  for other_suffix, _ in iter_architecture_defaults())
    return next(
        (base_arch for base_arch in candidates if base_arch in self.models),
        architecture,
    )
def _normalize_archs(
self,
architectures: Union[str, list[str]],
architectures: list[str],
model_config: ModelConfig,
) -> list[str]:
if isinstance(architectures, str):
architectures = [architectures]
if not architectures:
logger.warning("No model architectures are specified")
# filter out support architectures
normalized_arch = list(
filter(lambda model: model in self.models, architectures))
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch in self.models:
normalized_arch.append(arch)
# NOTE(Isotr0py): Be careful of architectures' order!
# Make sure Transformers backend architecture is at the end of the
# list, otherwise pooling models automatic conversion will fail!
for arch in normalized_arch:
if arch.startswith("TransformersFor"):
normalized_arch.remove(arch)
normalized_arch.append(arch)
return normalized_arch
return [
self._normalize_arch(arch, model_config) for arch in architectures
]
def inspect_model_cls(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures)
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
normalized_archs = self._normalize_archs(architectures, model_config)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_info = self._try_inspect_model_cls(normalized_arch)
if model_info is not None:
return (model_info, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
......@@ -546,10 +636,32 @@ class _ModelRegistry:
def resolve_model_cls(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[type[nn.Module], str]:
architectures = self._normalize_archs(architectures)
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
normalized_archs = self._normalize_archs(architectures, model_config)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_cls = self._try_load_model_cls(normalized_arch)
if model_cls is not None:
return (model_cls, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
......@@ -559,92 +671,105 @@ class _ModelRegistry:
def is_text_generation_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_text_generation_model
def is_pooling_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_pooling_model
def is_cross_encoder_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_cross_encoding
def is_multimodal_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal
def supports_multimodal_raw_input(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal_raw_input
def is_pp_supported_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_pp
def model_has_inner_state(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_inner_state
def is_attention_free_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_attention_free
def is_hybrid_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_hybrid
def is_noops_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_noops
def is_transcription_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription
def is_transcription_only_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription_only
def is_v1_compatible(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return not model_cls.supports_v0_only
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional, Union
from transformers.dynamic_module_utils import get_class_from_dynamic_module
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[type]:
    """
    As [transformers.dynamic_module_utils.get_class_from_dynamic_module][],
    but ignoring any errors.

    Every argument except `warn_on_fail` is forwarded unchanged to the
    wrapped function.

    Args:
        warn_on_fail: If true, a failure is reported through
            `logger.warning` together with the exception traceback.
            If false, the failure is swallowed silently.

    Returns:
        The class returned by the wrapped function, or `None` if the
        wrapped call raised any `Exception`.
    """
    try:
        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        # Deliberately broad: this helper is best-effort by contract, so a
        # failed load is reported as `None` instead of propagating.
        if warn_on_fail:
            # The hub name is only needed for the log message, so it is
            # resolved only when a warning is actually emitted.
            location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                location,
                exc_info=True,
            )

        return None
......@@ -3,6 +3,8 @@
from typing import Optional
from typing_extensions import assert_never
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
......@@ -108,6 +110,14 @@ class TokenizerGroup:
def init_tokenizer_from_configs(model_config: ModelConfig,
scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig]):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)
return TokenizerGroup(
tokenizer_id=model_config.tokenizer,
enable_lora=bool(lora_config),
......@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=model_config.truncation_side)
truncation_side=truncation_side)
......@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None
self.is_encoder_only_model = False
self.model_supports_multimodal_raw_input = (
model_config.model_supports_multimodal_raw_input)
self.is_multimodal_raw_input_supported = (
model_config.is_multimodal_raw_input_supported)
self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs
......@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) -> dict[str, Any]:
model_kwargs: dict[str, Any] = {}
if self.model_supports_multimodal_raw_input:
if self.is_multimodal_raw_input_supported:
# This model requires the raw multimodal data in input.
if scheduler_output:
multi_modal_kwargs_list = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment