Unverified commit 86ae693f, authored by Cyrus Leung and committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
......@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(method: str, k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
task="generate",
max_model_len=100,
tokenizer=model_dir,
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
runner="generate",
max_model_len=100)
# Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
......
......@@ -44,14 +44,7 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m",
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
model_config = ModelConfig(model="facebook/opt-125m")
return NgramProposer(
vllm_config=VllmConfig(model_config=model_config,
speculative_config=SpeculativeConfig.
......
......@@ -26,10 +26,6 @@ def get_vllm_config():
)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16", # TPUs typically use bfloat16
seed=42,
)
......
......@@ -76,10 +76,6 @@ def get_vllm_config():
)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
seed=42,
)
......
This diff is collapsed.
......@@ -22,14 +22,15 @@ from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig,
DetailedTraceModules, Device, DeviceConfig,
DistributedExecutorBackend, GuidedDecodingBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
KVTransferConfig, LoadConfig, LogprobsMode,
LoRAConfig, ModelConfig, ModelDType, ModelImpl,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
PoolerConfig, PrefixCachingHashAlgo, SchedulerConfig,
ConfigFormat, ConfigType, ConvertOption,
DecodingConfig, DetailedTraceModules, Device,
DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackend, GuidedDecodingBackendV1,
HfOverrides, KVEventsConfig, KVTransferConfig,
LoadConfig, LogprobsMode, LoRAConfig, ModelConfig,
ModelDType, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
SchedulerPolicy, SpeculativeConfig, TaskOption,
TokenizerMode, VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger
......@@ -270,7 +271,9 @@ class EngineArgs:
str, List[str]]] = ModelConfig.served_model_name
tokenizer: Optional[str] = ModelConfig.tokenizer
hf_config_path: Optional[str] = ModelConfig.hf_config_path
task: TaskOption = ModelConfig.task
runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: Optional[TaskOption] = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
......@@ -461,7 +464,11 @@ class EngineArgs:
)
if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--task", **model_kwargs["task"])
model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task",
**model_kwargs["task"],
deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode",
**model_kwargs["tokenizer_mode"])
......@@ -870,6 +877,8 @@ class EngineArgs:
return ModelConfig(
model=self.model,
hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
task=self.task,
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
......
......@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
create_sort_beams_key_function)
from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
is_init_field)
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
TaskOption)
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
......@@ -170,7 +170,8 @@ class LLM:
self,
model: str,
*,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer: Optional[str] = None,
tokenizer_mode: TokenizerMode = "auto",
skip_tokenizer_init: bool = False,
......@@ -244,7 +245,8 @@ class LLM:
engine_args = EngineArgs(
model=model,
task=task,
runner=runner,
convert=convert,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init,
......@@ -459,18 +461,10 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "generate":
messages = [
"LLM.generate() is only supported for generative models."
]
if "generate" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'generate' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task generate` or "
"`--task transcription`.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.generate() is only supported for generative models. "
"Try passing `--runner generate` to use the model as a "
"generative model.")
if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs(
......@@ -497,7 +491,8 @@ class LLM:
truncate_prompt_tokens = None
if isinstance(sampling_params, SamplingParams):
truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs)
# Add any modality specific loras to the corresponding prompts
......@@ -1100,16 +1095,10 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
messages = ["LLM.encode() is only supported for pooling models."]
if "pooling" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.encode() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"pooling model.")
if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs(
......@@ -1183,8 +1172,9 @@ class LLM:
embedding vectors in the same order as the input prompts.
"""
if "embed" not in self.supported_tasks:
raise ValueError("Embedding API is not supported by this model. "
"Please set `--task embed`.")
raise ValueError(
"Embedding API is not supported by this model. "
"Try converting the model using `--convert embed`.")
items = self.encode(
prompts,
......@@ -1229,7 +1219,7 @@ class LLM:
if "classify" not in self.supported_tasks:
raise ValueError(
"Classification API is not supported by this model. "
"Please set `--task classify`.")
"Try converting the model using `--convert classify`.")
items = self.encode(
prompts,
......@@ -1283,27 +1273,26 @@ class LLM:
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
model_config = self.llm_engine.model_config
if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
"Score API is only enabled for `--task embed or score`")
"Score API is not supported for Mistral tokenizer")
if len(data_1) == 1:
data_1 = data_1 * len(data_2)
pooling_params = PoolingParams(task="score")
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs)
parsed_prompts = []
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
if self.llm_engine.model_config.is_multimodal_model:
model_config = self.llm_engine.model_config
if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
......@@ -1314,11 +1303,9 @@ class LLM:
)
parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
......@@ -1396,23 +1383,18 @@ class LLM:
model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
messages = ["LLM.score() is only supported for pooling models."]
if "pooling" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
raise ValueError(
"LLM.score() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"pooling model.")
supported_tasks = self.supported_tasks
if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.")
"Try converting the model using "
"`--convert embed` or `--convert classify`.")
if (model_config.task == "classify"
if (model_config.is_cross_encoder
and getattr(model_config.hf_config, "num_labels", 0) != 1):
raise ValueError("Score API is only enabled for num_labels == 1.")
......@@ -1421,15 +1403,14 @@ class LLM:
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer()
if not self.llm_engine.model_config.is_multimodal_model:
if not model_config.is_multimodal_model:
def check_data_type(data: Union[SingletonPrompt,
Sequence[SingletonPrompt],
ScoreMultiModalParam]):
if isinstance(data, dict) and "content" in data:
raise ValueError(
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501
)
raise ValueError("ScoreMultiModalParam is not supported "
f"for {model_config.architecture}")
check_data_type(data_1)
check_data_type(data_2)
......@@ -1471,7 +1452,7 @@ class LLM:
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
if self.llm_engine.model_config.is_cross_encoder:
if model_config.is_cross_encoder:
return self._cross_encoding_score(
tokenizer,
data_1, # type: ignore[arg-type]
......
......@@ -1734,7 +1734,6 @@ async def init_app_state(
state.openai_serving_models,
request_logger=request_logger,
) if "transcription" in supported_tasks else None
state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0
......
......@@ -9,9 +9,8 @@ from dataclasses import dataclass, field
from typing import Optional
import torch
import transformers
from torch import nn
from transformers.dynamic_module_utils import get_class_from_dynamic_module
from typing_extensions import assert_never
from vllm.attention import Attention
from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
......@@ -20,13 +19,10 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
......@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module,
# New parameters or parameters already on target device are untouched
def resolve_transformers_arch(model_config: ModelConfig,
                              architectures: list[str]):
    """Point non-backend architectures at the Transformers backend class.

    Every entry of ``architectures`` that is not already listed in
    ``_TRANSFORMERS_BACKEND_MODELS`` is overwritten, in place, with
    ``model_config._get_transformers_backend_cls()``.

    Args:
        model_config: Supplies ``model_impl``, ``hf_config``, ``model`` and
            ``revision``.
        architectures: Architecture names to resolve; mutated in place.

    Returns:
        The same ``architectures`` list that was passed in.

    Raises:
        ValueError: If ``model_impl`` is ``ModelImpl.VLLM``, if no model
            class can be located for an architecture, or if the located
            class reports that it is not backend compatible.
    """
    if model_config.model_impl == ModelImpl.VLLM:
        raise ValueError(
            "Attempting to resolve architecture from the Transformers library "
            "but the model implementation is set to vLLM. This should never "
            "happen.")

    # Only the AUTO implementation announces the fallback to the user.
    announce_fallback = model_config.model_impl == ModelImpl.AUTO

    for index, arch in enumerate(architectures):
        if arch in _TRANSFORMERS_BACKEND_MODELS:
            # Already a Transformers backend architecture; leave untouched.
            continue

        if announce_fallback:
            logger.warning(
                "%s has no vLLM implementation, falling back to Transformers "
                "implementation. Some features may not be supported and "
                "performance may not be optimal.", arch)

        auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
                                           None) or {}

        # Load the remote classes in name order so that the config class is
        # initialized before any model class that needs to access it.
        # The expected auto_map looks like:
        #     "auto_map": {
        #         "AutoConfig": "<your-repo-name>--<config-name>",
        #         "AutoModel": "<your-repo-name>--<config-name>",
        #         "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
        #     },
        dynamic_classes = {}
        for entry_name in sorted(auto_map):
            dynamic_classes[entry_name] = get_class_from_dynamic_module(
                auto_map[entry_name],
                model_config.model,
                revision=model_config.revision)

        hf_model_cls = getattr(transformers, arch, None)
        if hf_model_cls is None and "AutoModel" not in auto_map:
            raise ValueError(
                f"Cannot find model module. '{arch}' is not a registered "
                "model in the Transformers library (only relevant if the "
                "model is meant to be in Transformers) and 'AutoModel' is "
                "not present in the model config's 'auto_map' (relevant "
                "if the model is custom).")
        if hf_model_cls is None:
            # Not shipped with Transformers: use the custom remote class.
            hf_model_cls = dynamic_classes["AutoModel"]

        if not hf_model_cls.is_backend_compatible():
            raise ValueError(
                f"The Transformers implementation of '{arch}' is not "
                "compatible with vLLM.")

        architectures[index] = model_config._get_transformers_backend_cls()

    return architectures
def get_model_architecture(
model_config: ModelConfig) -> tuple[type[nn.Module], str]:
architectures = getattr(model_config.hf_config, "architectures", [])
......@@ -239,56 +180,38 @@ def get_model_architecture(
"bitsandbytes",
]
vllm_supported_archs = ModelRegistry.get_supported_archs()
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
_TRANSFORMERS_BACKEND_MODELS)
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
if vllm_not_supported:
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
assert model_config.task == "classify"
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
causal_lm_arch_vllm_supported = (causal_lm_arch
in vllm_supported_archs)
if not causal_lm_arch_vllm_supported:
continue
architectures = [causal_lm_arch]
vllm_not_supported = False
break
if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures):
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]]
raise ValueError(
f"Model architecture {architectures[0]} was supported"
f" in vLLM until version {previous_version}, and is "
"not supported anymore. Please use an older version"
" of vLLM if you want to use this model architecture.")
if (model_config.model_impl == ModelImpl.TRANSFORMERS or
model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
architectures = resolve_transformers_arch(model_config, architectures)
logger.debug_once("Resolve transformers arch %s", str(architectures))
elif (model_config.quantization is not None
if (model_config.quantization is not None
and model_config.quantization not in mixtral_supported
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
if model_config.task == "embed":
logger.debug_once("Automatic conversion using `as_embedding_model`.")
model_cls, arch = model_config.registry.resolve_model_cls(
architectures,
model_config=model_config,
)
if arch == model_config._get_transformers_backend_cls():
assert model_config.model_impl != ModelImpl.VLLM
if model_config.model_impl == ModelImpl.AUTO:
logger.warning_once(
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
convert_type = model_config.convert_type
if convert_type == "none":
pass
elif convert_type == "embed":
logger.debug_once("Converting to embedding model.")
model_cls = as_embedding_model(model_cls)
elif model_config.task == "classify":
logger.debug_once("Automatic conversion using `as_seq_cls_model`.")
elif convert_type == "classify":
logger.debug_once("Converting to sequence classification model.")
model_cls = as_seq_cls_model(model_cls)
elif model_config.task == "reward":
logger.debug_once("Automatic conversion using `as_reward_model`.")
elif convert_type == "reward":
logger.debug_once("Converting to reward model.")
model_cls = as_reward_model(model_cls)
else:
assert_never(convert_type)
return model_cls, arch
......
......@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
dtype=kv_cache_dtype,
use_mla=model_config.use_mla).page_size_bytes
model_cls = ModelRegistry.resolve_model_cls(
model_config._model_info.architecture)[0]
model_cls, _ = ModelRegistry.resolve_model_cls(
model_config.architecture,
model_config=model_config,
)
# get mamba page size
mamba_page_size = MambaSpec(
......
......@@ -12,19 +12,24 @@ import sys
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Set
from dataclasses import asdict, dataclass, field
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn
import transformers
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
try_match_architecture_defaults)
from vllm.logger import init_logger
from vllm.transformers_utils.dynamic_module import (
try_get_class_from_dynamic_module)
from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding,
supports_multimodal, supports_multimodal_raw_input,
supports_pp, supports_transcription, supports_v0_only)
from .interfaces_base import is_text_generation_model
from .interfaces_base import is_pooling_model, is_text_generation_model
logger = init_logger(__name__)
......@@ -311,7 +316,7 @@ class _ModelInfo:
return _ModelInfo(
architecture=model.__name__,
is_text_generation_model=is_text_generation_model(model),
is_pooling_model=True, # Can convert any model into a pooling model
is_pooling_model=is_pooling_model(model),
supports_cross_encoding=supports_cross_encoding(model),
supports_multimodal=supports_multimodal(model),
supports_multimodal_raw_input=supports_multimodal_raw_input(model),
......@@ -465,6 +470,16 @@ class _ModelRegistry:
f"Model architectures {architectures} failed "
"to be inspected. Please check the logs for more details.")
for arch in architectures:
if arch in _PREVIOUSLY_SUPPORTED_MODELS:
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[arch]
raise ValueError(
f"Model architecture {arch} was supported in vLLM until "
f"v{previous_version}, and is not supported anymore. "
"Please use an older version of vLLM if you want to "
"use this model architecture.")
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {all_supported_archs}")
......@@ -477,66 +492,141 @@ class _ModelRegistry:
return _try_load_model_cls(model_arch, self.models[model_arch])
def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
if model_arch in self.models:
if model_arch not in self.models:
return None
return _try_inspect_model_cls(model_arch, self.models[model_arch])
if model_arch.endswith("ForSequenceClassification"):
causal_lm_arch = model_arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch not in self.models:
return None
def _try_resolve_transformers(
self,
architecture: str,
model_config: ModelConfig,
) -> Optional[str]:
if architecture in _TRANSFORMERS_BACKEND_MODELS:
return architecture
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=False,
)
info = _try_inspect_model_cls(causal_lm_arch,
self.models[causal_lm_arch])
model_module = getattr(transformers, architecture, None)
info = _ModelInfo(**dict(
asdict(info), **{
"architecture": model_arch,
"supports_cross_encoding": True
}))
return info
if model_module is None:
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=True,
)
if model_module is not None:
break
else:
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"Cannot find model module. {architecture!r} is not a "
"registered model in the Transformers library (only "
"relevant if the model is meant to be in Transformers) "
"and 'AutoModel' is not present in the model config's "
"'auto_map' (relevant if the model is custom).")
if not model_module.is_backend_compatible():
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"The Transformers implementation of {architecture!r} "
"is not compatible with vLLM.")
return model_config._get_transformers_backend_cls()
def _normalize_arch(
    self,
    architecture: str,
    model_config: ModelConfig,
) -> str:
    """Return the registered architecture name to use for ``architecture``.

    A name that is already a key of ``self.models`` is returned unchanged.
    Otherwise the name is matched against the architecture defaults and,
    on a match, the matched suffix is swapped for each known suffix in
    turn; the first resulting name found in ``self.models`` is returned.
    When nothing matches, ``architecture`` is returned as-is.
    """
    if architecture in self.models:
        return architecture

    # This method may itself be used while runner_type / convert_type are
    # still being resolved, so they can be absent from model_config; in
    # that case None is passed and the default match is considered.
    default_match = try_match_architecture_defaults(
        architecture,
        runner_type=getattr(model_config, "runner_type", None),
        convert_type=getattr(model_config, "convert_type", None),
    )
    if not default_match:
        return architecture

    matched_suffix, _ = default_match

    # Candidate names of the base model to convert, generated lazily so
    # that the search stops at the first registered one.
    base_candidates = (architecture.replace(matched_suffix, other_suffix)
                       for other_suffix, _ in iter_architecture_defaults())
    return next(
        (candidate
         for candidate in base_candidates if candidate in self.models),
        architecture,
    )
def _normalize_archs(
self,
architectures: Union[str, list[str]],
architectures: list[str],
model_config: ModelConfig,
) -> list[str]:
if isinstance(architectures, str):
architectures = [architectures]
if not architectures:
logger.warning("No model architectures are specified")
# filter out support architectures
normalized_arch = list(
filter(lambda model: model in self.models, architectures))
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch in self.models:
normalized_arch.append(arch)
# NOTE(Isotr0py): Be careful of architectures' order!
# Make sure Transformers backend architecture is at the end of the
# list, otherwise pooling models automatic conversion will fail!
for arch in normalized_arch:
if arch.startswith("TransformersFor"):
normalized_arch.remove(arch)
normalized_arch.append(arch)
return normalized_arch
return [
self._normalize_arch(arch, model_config) for arch in architectures
]
def inspect_model_cls(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures)
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
normalized_archs = self._normalize_archs(architectures, model_config)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_info = self._try_inspect_model_cls(normalized_arch)
if model_info is not None:
return (model_info, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
......@@ -546,10 +636,32 @@ class _ModelRegistry:
def resolve_model_cls(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[type[nn.Module], str]:
architectures = self._normalize_archs(architectures)
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
normalized_archs = self._normalize_archs(architectures, model_config)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_cls = self._try_load_model_cls(normalized_arch)
if model_cls is not None:
return (model_cls, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
......@@ -559,92 +671,105 @@ class _ModelRegistry:
def is_text_generation_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_text_generation_model
def is_pooling_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_pooling_model
def is_cross_encoder_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_cross_encoding
def is_multimodal_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal
def supports_multimodal_raw_input(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal_raw_input
def is_pp_supported_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_pp
def model_has_inner_state(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_inner_state
def is_attention_free_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_attention_free
def is_hybrid_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_hybrid
def is_noops_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_noops
def is_transcription_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription
def is_transcription_only_model(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription_only
def is_v1_compatible(
self,
architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool:
model_cls, _ = self.inspect_model_cls(architectures)
model_cls, _ = self.inspect_model_cls(architectures, model_config)
return not model_cls.supports_v0_only
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional, Union
from transformers.dynamic_module_utils import get_class_from_dynamic_module
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[type]:
    """
    Best-effort wrapper around
    [transformers.dynamic_module_utils.get_class_from_dynamic_module][].

    All arguments except `warn_on_fail` are forwarded unchanged. Any
    exception raised by the underlying call is swallowed and `None` is
    returned instead; when `warn_on_fail` is true, a warning including the
    traceback is logged first.
    """
    hub_options = {
        "cache_dir": cache_dir,
        "force_download": force_download,
        "resume_download": resume_download,
        "proxies": proxies,
        "token": token,
        "revision": revision,
        "local_files_only": local_files_only,
        "repo_type": repo_type,
        "code_revision": code_revision,
    }

    try:
        loaded_cls = get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            **hub_options,
            **kwargs,
        )
    except Exception:
        # Deliberately broad: this helper promises to ignore any failure.
        if envs.VLLM_USE_MODELSCOPE:
            location = "ModelScope"
        else:
            location = "HF Hub"

        if warn_on_fail:
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                location,
                exc_info=True,
            )

        return None
    else:
        return loaded_cls
......@@ -3,6 +3,8 @@
from typing import Optional
from typing_extensions import assert_never
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
......@@ -108,6 +110,14 @@ class TokenizerGroup:
def init_tokenizer_from_configs(model_config: ModelConfig,
scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig]):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)
return TokenizerGroup(
tokenizer_id=model_config.tokenizer,
enable_lora=bool(lora_config),
......@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=model_config.truncation_side)
truncation_side=truncation_side)
......@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None
self.is_encoder_only_model = False
self.model_supports_multimodal_raw_input = (
model_config.model_supports_multimodal_raw_input)
self.is_multimodal_raw_input_supported = (
model_config.is_multimodal_raw_input_supported)
self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs
......@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) -> dict[str, Any]:
model_kwargs: dict[str, Any] = {}
if self.model_supports_multimodal_raw_input:
if self.is_multimodal_raw_input_supported:
# This model requires the raw multimodal data in input.
if scheduler_output:
multi_modal_kwargs_list = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment