"deploy/cloud/helm/vscode:/vscode.git/clone" did not exist on "cfc6178aa324bb31f5bb7ffa65057f5fa59e9e4f"
Unverified commit 86ae693f, authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
...@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" ...@@ -24,13 +24,8 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(method: str, k: int) -> EagleProposer: def _create_proposer(method: str, k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir, model_config = ModelConfig(model=model_dir,
task="generate", runner="generate",
max_model_len=100, max_model_len=100)
tokenizer=model_dir,
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
# Choose model directory based on method # Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir
......
...@@ -44,14 +44,7 @@ def test_ngram_proposer(): ...@@ -44,14 +44,7 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len. # Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m", model_config = ModelConfig(model="facebook/opt-125m")
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False)
return NgramProposer( return NgramProposer(
vllm_config=VllmConfig(model_config=model_config, vllm_config=VllmConfig(model_config=model_config,
speculative_config=SpeculativeConfig. speculative_config=SpeculativeConfig.
......
...@@ -26,10 +26,6 @@ def get_vllm_config(): ...@@ -26,10 +26,6 @@ def get_vllm_config():
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16", # TPUs typically use bfloat16 dtype="bfloat16", # TPUs typically use bfloat16
seed=42, seed=42,
) )
......
...@@ -76,10 +76,6 @@ def get_vllm_config(): ...@@ -76,10 +76,6 @@ def get_vllm_config():
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model="facebook/opt-125m",
task="generate",
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16", dtype="float16",
seed=42, seed=42,
) )
......
This diff is collapsed.
...@@ -22,14 +22,15 @@ from typing_extensions import TypeIs ...@@ -22,14 +22,15 @@ from typing_extensions import TypeIs
import vllm.envs as envs import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig, ConfigFormat, ConfigType, ConvertOption,
DetailedTraceModules, Device, DeviceConfig, DecodingConfig, DetailedTraceModules, Device,
DistributedExecutorBackend, GuidedDecodingBackend, DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig, GuidedDecodingBackend, GuidedDecodingBackendV1,
KVTransferConfig, LoadConfig, LogprobsMode, HfOverrides, KVEventsConfig, KVTransferConfig,
LoRAConfig, ModelConfig, ModelDType, ModelImpl, LoadConfig, LogprobsMode, LoRAConfig, ModelConfig,
MultiModalConfig, ObservabilityConfig, ParallelConfig, ModelDType, ModelImpl, MultiModalConfig,
PoolerConfig, PrefixCachingHashAlgo, SchedulerConfig, ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
SchedulerPolicy, SpeculativeConfig, TaskOption, SchedulerPolicy, SpeculativeConfig, TaskOption,
TokenizerMode, VllmConfig, get_attr_docs, get_field) TokenizerMode, VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -270,7 +271,9 @@ class EngineArgs: ...@@ -270,7 +271,9 @@ class EngineArgs:
str, List[str]]] = ModelConfig.served_model_name str, List[str]]] = ModelConfig.served_model_name
tokenizer: Optional[str] = ModelConfig.tokenizer tokenizer: Optional[str] = ModelConfig.tokenizer
hf_config_path: Optional[str] = ModelConfig.hf_config_path hf_config_path: Optional[str] = ModelConfig.hf_config_path
task: TaskOption = ModelConfig.task runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: Optional[TaskOption] = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
...@@ -461,7 +464,11 @@ class EngineArgs: ...@@ -461,7 +464,11 @@ class EngineArgs:
) )
if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]): if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--task", **model_kwargs["task"]) model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task",
**model_kwargs["task"],
deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode", model_group.add_argument("--tokenizer-mode",
**model_kwargs["tokenizer_mode"]) **model_kwargs["tokenizer_mode"])
...@@ -870,6 +877,8 @@ class EngineArgs: ...@@ -870,6 +877,8 @@ class EngineArgs:
return ModelConfig( return ModelConfig(
model=self.model, model=self.model,
hf_config_path=self.hf_config_path, hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
task=self.task, task=self.task,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode, tokenizer_mode=self.tokenizer_mode,
......
...@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, ...@@ -20,8 +20,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
create_sort_beams_key_function) create_sort_beams_key_function)
from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
is_init_field) is_init_field)
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
TaskOption) PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
...@@ -170,7 +170,8 @@ class LLM: ...@@ -170,7 +170,8 @@ class LLM:
self, self,
model: str, model: str,
*, *,
task: TaskOption = "auto", runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer: Optional[str] = None, tokenizer: Optional[str] = None,
tokenizer_mode: TokenizerMode = "auto", tokenizer_mode: TokenizerMode = "auto",
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
...@@ -244,7 +245,8 @@ class LLM: ...@@ -244,7 +245,8 @@ class LLM:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, model=model,
task=task, runner=runner,
convert=convert,
tokenizer=tokenizer, tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
skip_tokenizer_init=skip_tokenizer_init, skip_tokenizer_init=skip_tokenizer_init,
...@@ -459,18 +461,10 @@ class LLM: ...@@ -459,18 +461,10 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "generate": if runner_type != "generate":
messages = [ raise ValueError(
"LLM.generate() is only supported for generative models." "LLM.generate() is only supported for generative models. "
] "Try passing `--runner generate` to use the model as a "
"generative model.")
if "generate" in model_config.supported_runner_types:
messages.append(
"Your model supports the 'generate' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task generate` or "
"`--task transcription`.")
raise ValueError(" ".join(messages))
if prompt_token_ids is not None: if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs( parsed_prompts = self._convert_v1_inputs(
...@@ -497,7 +491,8 @@ class LLM: ...@@ -497,7 +491,8 @@ class LLM:
truncate_prompt_tokens = None truncate_prompt_tokens = None
if isinstance(sampling_params, SamplingParams): if isinstance(sampling_params, SamplingParams):
truncate_prompt_tokens = sampling_params.truncate_prompt_tokens truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs) truncate_prompt_tokens, tokenization_kwargs)
# Add any modality specific loras to the corresponding prompts # Add any modality specific loras to the corresponding prompts
...@@ -1100,16 +1095,10 @@ class LLM: ...@@ -1100,16 +1095,10 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "pooling": if runner_type != "pooling":
messages = ["LLM.encode() is only supported for pooling models."] raise ValueError(
"LLM.encode() is only supported for pooling models. "
if "pooling" in model_config.supported_runner_types: "Try passing `--runner pooling` to use the model as a "
messages.append( "pooling model.")
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
if prompt_token_ids is not None: if prompt_token_ids is not None:
parsed_prompts = self._convert_v1_inputs( parsed_prompts = self._convert_v1_inputs(
...@@ -1183,8 +1172,9 @@ class LLM: ...@@ -1183,8 +1172,9 @@ class LLM:
embedding vectors in the same order as the input prompts. embedding vectors in the same order as the input prompts.
""" """
if "embed" not in self.supported_tasks: if "embed" not in self.supported_tasks:
raise ValueError("Embedding API is not supported by this model. " raise ValueError(
"Please set `--task embed`.") "Embedding API is not supported by this model. "
"Try converting the model using `--convert embed`.")
items = self.encode( items = self.encode(
prompts, prompts,
...@@ -1229,7 +1219,7 @@ class LLM: ...@@ -1229,7 +1219,7 @@ class LLM:
if "classify" not in self.supported_tasks: if "classify" not in self.supported_tasks:
raise ValueError( raise ValueError(
"Classification API is not supported by this model. " "Classification API is not supported by this model. "
"Please set `--task classify`.") "Try converting the model using `--convert classify`.")
items = self.encode( items = self.encode(
prompts, prompts,
...@@ -1283,27 +1273,26 @@ class LLM: ...@@ -1283,27 +1273,26 @@ class LLM:
use_tqdm: Union[bool, Callable[..., tqdm]] = True, use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
model_config = self.llm_engine.model_config
if isinstance(tokenizer, MistralTokenizer): if isinstance(tokenizer, MistralTokenizer):
raise ValueError( raise ValueError(
"Score API is only enabled for `--task embed or score`") "Score API is not supported for Mistral tokenizer")
if len(data_1) == 1: if len(data_1) == 1:
data_1 = data_1 * len(data_2) data_1 = data_1 * len(data_2)
pooling_params = PoolingParams(task="score") pooling_params = PoolingParams(task="score")
tokenization_kwargs: dict[str, Any] = {} tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
_validate_truncation_size(model_config.max_model_len,
truncate_prompt_tokens, tokenization_kwargs) truncate_prompt_tokens, tokenization_kwargs)
parsed_prompts = [] parsed_prompts = []
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
if self.llm_engine.model_config.is_multimodal_model: if model_config.is_multimodal_model:
model_config = self.llm_engine.model_config
for q, d in input_pairs: for q, d in input_pairs:
_, engine_prompt = get_score_prompt( _, engine_prompt = get_score_prompt(
model_config=model_config, model_config=model_config,
...@@ -1314,11 +1303,9 @@ class LLM: ...@@ -1314,11 +1303,9 @@ class LLM:
) )
parsed_prompts.append(engine_prompt) parsed_prompts.append(engine_prompt)
else: else:
for q, t in input_pairs: for q, t in input_pairs:
if self.llm_engine.model_config.use_pad_token: if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token. # cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer( prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type] text=q, # type: ignore[arg-type]
...@@ -1396,23 +1383,18 @@ class LLM: ...@@ -1396,23 +1383,18 @@ class LLM:
model_config = self.llm_engine.model_config model_config = self.llm_engine.model_config
runner_type = model_config.runner_type runner_type = model_config.runner_type
if runner_type != "pooling": if runner_type != "pooling":
messages = ["LLM.score() is only supported for pooling models."] raise ValueError(
"LLM.score() is only supported for pooling models. "
if "pooling" in model_config.supported_runner_types: "Try passing `--runner pooling` to use the model as a "
messages.append( "pooling model.")
"Your model supports the 'pooling' runner, but is "
f"currently initialized for the '{runner_type}' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc.")
raise ValueError(" ".join(messages))
supported_tasks = self.supported_tasks supported_tasks = self.supported_tasks
if all(t not in supported_tasks for t in ("embed", "classify")): if all(t not in supported_tasks for t in ("embed", "classify")):
raise ValueError("Score API is not supported by this model. " raise ValueError("Score API is not supported by this model. "
"Please set `--task embed` or `--task classify`.") "Try converting the model using "
"`--convert embed` or `--convert classify`.")
if (model_config.task == "classify" if (model_config.is_cross_encoder
and getattr(model_config.hf_config, "num_labels", 0) != 1): and getattr(model_config.hf_config, "num_labels", 0) != 1):
raise ValueError("Score API is only enabled for num_labels == 1.") raise ValueError("Score API is only enabled for num_labels == 1.")
...@@ -1421,15 +1403,14 @@ class LLM: ...@@ -1421,15 +1403,14 @@ class LLM:
# lists of tokens to the `text` and `text_pair` kwargs # lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
if not self.llm_engine.model_config.is_multimodal_model: if not model_config.is_multimodal_model:
def check_data_type(data: Union[SingletonPrompt, def check_data_type(data: Union[SingletonPrompt,
Sequence[SingletonPrompt], Sequence[SingletonPrompt],
ScoreMultiModalParam]): ScoreMultiModalParam]):
if isinstance(data, dict) and "content" in data: if isinstance(data, dict) and "content" in data:
raise ValueError( raise ValueError("ScoreMultiModalParam is not supported "
f"ScoreMultiModalParam is not supported for {self.llm_engine.model_config.architecture}", # noqa: E501 f"for {model_config.architecture}")
)
check_data_type(data_1) check_data_type(data_1)
check_data_type(data_2) check_data_type(data_2)
...@@ -1471,7 +1452,7 @@ class LLM: ...@@ -1471,7 +1452,7 @@ class LLM:
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
if self.llm_engine.model_config.is_cross_encoder: if model_config.is_cross_encoder:
return self._cross_encoding_score( return self._cross_encoding_score(
tokenizer, tokenizer,
data_1, # type: ignore[arg-type] data_1, # type: ignore[arg-type]
......
...@@ -1734,7 +1734,6 @@ async def init_app_state( ...@@ -1734,7 +1734,6 @@ async def init_app_state(
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
) if "transcription" in supported_tasks else None ) if "transcription" in supported_tasks else None
state.task = model_config.task
state.enable_server_load_tracking = args.enable_server_load_tracking state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0 state.server_load_metrics = 0
......
...@@ -9,9 +9,8 @@ from dataclasses import dataclass, field ...@@ -9,9 +9,8 @@ from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import torch import torch
import transformers
from torch import nn from torch import nn
from transformers.dynamic_module_utils import get_class_from_dynamic_module from typing_extensions import assert_never
from vllm.attention import Attention from vllm.attention import Attention
from vllm.config import (ModelConfig, ModelImpl, VllmConfig, from vllm.config import (ModelConfig, ModelImpl, VllmConfig,
...@@ -20,13 +19,10 @@ from vllm.logger import init_logger ...@@ -20,13 +19,10 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.linear import QKVCrossParallelLinear from vllm.model_executor.layers.linear import QKVCrossParallelLinear
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase) QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.adapters import (as_embedding_model, from vllm.model_executor.models.adapters import (as_embedding_model,
as_reward_model, as_reward_model,
as_seq_cls_model) as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available from vllm.utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module, ...@@ -169,61 +165,6 @@ def device_loading_context(module: torch.nn.Module,
# New parameters or parameters already on target device are untouched # New parameters or parameters already on target device are untouched
def resolve_transformers_arch(model_config: ModelConfig,
architectures: list[str]):
if model_config.model_impl == ModelImpl.VLLM:
raise ValueError(
"Attempting to resolve architecture from the Transformers library "
"but the model implementation is set to vLLM. This should never "
"happen.")
for i, arch in enumerate(architectures):
if arch in _TRANSFORMERS_BACKEND_MODELS:
continue
if model_config.model_impl == ModelImpl.AUTO:
logger.warning(
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
auto_modules = {
name:
get_class_from_dynamic_module(module,
model_config.model,
revision=model_config.revision)
for name, module in sorted(auto_map.items(), key=lambda x: x[0])
}
model_module = getattr(transformers, arch, None)
if model_module is None:
if "AutoModel" not in auto_map:
raise ValueError(
f"Cannot find model module. '{arch}' is not a registered "
"model in the Transformers library (only relevant if the "
"model is meant to be in Transformers) and 'AutoModel' is "
"not present in the model config's 'auto_map' (relevant "
"if the model is custom).")
model_module = auto_modules["AutoModel"]
if not model_module.is_backend_compatible():
raise ValueError(
f"The Transformers implementation of '{arch}' is not "
"compatible with vLLM.")
architectures[i] = model_config._get_transformers_backend_cls()
return architectures
def get_model_architecture( def get_model_architecture(
model_config: ModelConfig) -> tuple[type[nn.Module], str]: model_config: ModelConfig) -> tuple[type[nn.Module], str]:
architectures = getattr(model_config.hf_config, "architectures", []) architectures = getattr(model_config.hf_config, "architectures", [])
...@@ -239,56 +180,38 @@ def get_model_architecture( ...@@ -239,56 +180,38 @@ def get_model_architecture(
"bitsandbytes", "bitsandbytes",
] ]
vllm_supported_archs = ModelRegistry.get_supported_archs() if (model_config.quantization is not None
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in and model_config.quantization not in mixtral_supported
_TRANSFORMERS_BACKEND_MODELS) and "MixtralForCausalLM" in architectures):
vllm_not_supported = not any(is_supported(arch) for arch in architectures) architectures = ["QuantMixtralForCausalLM"]
if vllm_not_supported:
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
assert model_config.task == "classify" model_cls, arch = model_config.registry.resolve_model_cls(
causal_lm_arch = arch.replace("ForSequenceClassification", architectures,
"ForCausalLM") model_config=model_config,
causal_lm_arch_vllm_supported = (causal_lm_arch )
in vllm_supported_archs)
if not causal_lm_arch_vllm_supported:
continue
architectures = [causal_lm_arch] if arch == model_config._get_transformers_backend_cls():
vllm_not_supported = False assert model_config.model_impl != ModelImpl.VLLM
break if model_config.model_impl == ModelImpl.AUTO:
logger.warning_once(
if any(arch in _PREVIOUSLY_SUPPORTED_MODELS for arch in architectures): "%s has no vLLM implementation, falling back to Transformers "
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[architectures[0]] "implementation. Some features may not be supported and "
raise ValueError( "performance may not be optimal.", arch)
f"Model architecture {architectures[0]} was supported"
f" in vLLM until version {previous_version}, and is "
"not supported anymore. Please use an older version"
" of vLLM if you want to use this model architecture.")
if (model_config.model_impl == ModelImpl.TRANSFORMERS or
model_config.model_impl == ModelImpl.AUTO and vllm_not_supported):
architectures = resolve_transformers_arch(model_config, architectures)
logger.debug_once("Resolve transformers arch %s", str(architectures))
elif (model_config.quantization is not None
and model_config.quantization not in mixtral_supported
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
model_cls, arch = ModelRegistry.resolve_model_cls(architectures) convert_type = model_config.convert_type
if model_config.task == "embed": if convert_type == "none":
logger.debug_once("Automatic conversion using `as_embedding_model`.") pass
elif convert_type == "embed":
logger.debug_once("Converting to embedding model.")
model_cls = as_embedding_model(model_cls) model_cls = as_embedding_model(model_cls)
elif model_config.task == "classify": elif convert_type == "classify":
logger.debug_once("Automatic conversion using `as_seq_cls_model`.") logger.debug_once("Converting to sequence classification model.")
model_cls = as_seq_cls_model(model_cls) model_cls = as_seq_cls_model(model_cls)
elif model_config.task == "reward": elif convert_type == "reward":
logger.debug_once("Automatic conversion using `as_reward_model`.") logger.debug_once("Converting to reward model.")
model_cls = as_reward_model(model_cls) model_cls = as_reward_model(model_cls)
else:
assert_never(convert_type)
return model_cls, arch return model_cls, arch
......
...@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): ...@@ -253,8 +253,10 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
dtype=kv_cache_dtype, dtype=kv_cache_dtype,
use_mla=model_config.use_mla).page_size_bytes use_mla=model_config.use_mla).page_size_bytes
model_cls = ModelRegistry.resolve_model_cls( model_cls, _ = ModelRegistry.resolve_model_cls(
model_config._model_info.architecture)[0] model_config.architecture,
model_config=model_config,
)
# get mamba page size # get mamba page size
mamba_page_size = MambaSpec( mamba_page_size = MambaSpec(
......
...@@ -12,19 +12,24 @@ import sys ...@@ -12,19 +12,24 @@ import sys
import tempfile import tempfile
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Set from collections.abc import Set
from dataclasses import asdict, dataclass, field from dataclasses import dataclass, field
from functools import lru_cache from functools import lru_cache
from typing import Callable, Optional, TypeVar, Union from typing import Callable, Optional, TypeVar, Union
import torch.nn as nn import torch.nn as nn
import transformers
from vllm.config import (ModelConfig, ModelImpl, iter_architecture_defaults,
try_match_architecture_defaults)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.dynamic_module import (
try_get_class_from_dynamic_module)
from .interfaces import (has_inner_state, has_noops, is_attention_free, from .interfaces import (has_inner_state, has_noops, is_attention_free,
is_hybrid, supports_cross_encoding, is_hybrid, supports_cross_encoding,
supports_multimodal, supports_multimodal_raw_input, supports_multimodal, supports_multimodal_raw_input,
supports_pp, supports_transcription, supports_v0_only) supports_pp, supports_transcription, supports_v0_only)
from .interfaces_base import is_text_generation_model from .interfaces_base import is_pooling_model, is_text_generation_model
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -311,7 +316,7 @@ class _ModelInfo: ...@@ -311,7 +316,7 @@ class _ModelInfo:
return _ModelInfo( return _ModelInfo(
architecture=model.__name__, architecture=model.__name__,
is_text_generation_model=is_text_generation_model(model), is_text_generation_model=is_text_generation_model(model),
is_pooling_model=True, # Can convert any model into a pooling model is_pooling_model=is_pooling_model(model),
supports_cross_encoding=supports_cross_encoding(model), supports_cross_encoding=supports_cross_encoding(model),
supports_multimodal=supports_multimodal(model), supports_multimodal=supports_multimodal(model),
supports_multimodal_raw_input=supports_multimodal_raw_input(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model),
...@@ -465,6 +470,16 @@ class _ModelRegistry: ...@@ -465,6 +470,16 @@ class _ModelRegistry:
f"Model architectures {architectures} failed " f"Model architectures {architectures} failed "
"to be inspected. Please check the logs for more details.") "to be inspected. Please check the logs for more details.")
for arch in architectures:
if arch in _PREVIOUSLY_SUPPORTED_MODELS:
previous_version = _PREVIOUSLY_SUPPORTED_MODELS[arch]
raise ValueError(
f"Model architecture {arch} was supported in vLLM until "
f"v{previous_version}, and is not supported anymore. "
"Please use an older version of vLLM if you want to "
"use this model architecture.")
raise ValueError( raise ValueError(
f"Model architectures {architectures} are not supported for now. " f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {all_supported_archs}") f"Supported architectures: {all_supported_archs}")
...@@ -477,174 +492,284 @@ class _ModelRegistry: ...@@ -477,174 +492,284 @@ class _ModelRegistry:
return _try_load_model_cls(model_arch, self.models[model_arch]) return _try_load_model_cls(model_arch, self.models[model_arch])
def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
if model_arch in self.models: if model_arch not in self.models:
return _try_inspect_model_cls(model_arch, self.models[model_arch]) return None
if model_arch.endswith("ForSequenceClassification"): return _try_inspect_model_cls(model_arch, self.models[model_arch])
causal_lm_arch = model_arch.replace("ForSequenceClassification",
"ForCausalLM") def _try_resolve_transformers(
if causal_lm_arch not in self.models: self,
architecture: str,
model_config: ModelConfig,
) -> Optional[str]:
if architecture in _TRANSFORMERS_BACKEND_MODELS:
return architecture
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
# Make sure that config class is always initialized before model class,
# otherwise the model class won't be able to access the config class,
# the expected auto_map should have correct order like:
# "auto_map": {
# "AutoConfig": "<your-repo-name>--<config-name>",
# "AutoModel": "<your-repo-name>--<config-name>",
# "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
# },
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=False,
)
model_module = getattr(transformers, architecture, None)
if model_module is None:
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_config.model,
revision=model_config.revision,
warn_on_fail=True,
)
if model_module is not None:
break
else:
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None
raise ValueError(
f"Cannot find model module. {architecture!r} is not a "
"registered model in the Transformers library (only "
"relevant if the model is meant to be in Transformers) "
"and 'AutoModel' is not present in the model config's "
"'auto_map' (relevant if the model is custom).")
if not model_module.is_backend_compatible():
if model_config.model_impl != ModelImpl.TRANSFORMERS:
return None return None
info = _try_inspect_model_cls(causal_lm_arch, raise ValueError(
self.models[causal_lm_arch]) f"The Transformers implementation of {architecture!r} "
"is not compatible with vLLM.")
info = _ModelInfo(**dict( return model_config._get_transformers_backend_cls()
asdict(info), **{
"architecture": model_arch,
"supports_cross_encoding": True
}))
return info
return None def _normalize_arch(
self,
architecture: str,
model_config: ModelConfig,
) -> str:
if architecture in self.models:
return architecture
# This may be called in order to resolve runner_type and convert_type
# in the first place, in which case we consider the default match
match = try_match_architecture_defaults(
architecture,
runner_type=getattr(model_config, "runner_type", None),
convert_type=getattr(model_config, "convert_type", None),
)
if match:
suffix, _ = match
# Get the name of the base model to convert
for repl_suffix, _ in iter_architecture_defaults():
base_arch = architecture.replace(suffix, repl_suffix)
if base_arch in self.models:
return base_arch
return architecture
def _normalize_archs( def _normalize_archs(
self, self,
architectures: Union[str, list[str]], architectures: list[str],
model_config: ModelConfig,
) -> list[str]: ) -> list[str]:
if isinstance(architectures, str):
architectures = [architectures]
if not architectures: if not architectures:
logger.warning("No model architectures are specified") logger.warning("No model architectures are specified")
# filter out support architectures return [
normalized_arch = list( self._normalize_arch(arch, model_config) for arch in architectures
filter(lambda model: model in self.models, architectures)) ]
# try automatic conversion in adapters.py
for arch in architectures:
if not arch.endswith("ForSequenceClassification"):
continue
causal_lm_arch = arch.replace("ForSequenceClassification",
"ForCausalLM")
if causal_lm_arch in self.models:
normalized_arch.append(arch)
# NOTE(Isotr0py): Be careful of architectures' order!
# Make sure Transformers backend architecture is at the end of the
# list, otherwise pooling models automatic conversion will fail!
for arch in normalized_arch:
if arch.startswith("TransformersFor"):
normalized_arch.remove(arch)
normalized_arch.append(arch)
return normalized_arch
def inspect_model_cls( def inspect_model_cls(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[_ModelInfo, str]: ) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures) if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures: normalized_archs = self._normalize_archs(architectures, model_config)
model_info = self._try_inspect_model_cls(arch)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_info = self._try_inspect_model_cls(normalized_arch)
if model_info is not None: if model_info is not None:
return (model_info, arch) return (model_info, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return (model_info, arch)
return self._raise_for_unsupported(architectures) return self._raise_for_unsupported(architectures)
def resolve_model_cls( def resolve_model_cls(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> tuple[type[nn.Module], str]: ) -> tuple[type[nn.Module], str]:
architectures = self._normalize_archs(architectures) if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures: normalized_archs = self._normalize_archs(architectures, model_config)
model_cls = self._try_load_model_cls(arch)
# Require transformers impl
if model_config.model_impl == ModelImpl.TRANSFORMERS:
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
for arch, normalized_arch in zip(architectures, normalized_archs):
model_cls = self._try_load_model_cls(normalized_arch)
if model_cls is not None: if model_cls is not None:
return (model_cls, arch) return (model_cls, arch)
# Fallback to transformers impl
if model_config.model_impl in (ModelImpl.AUTO, ModelImpl.TRANSFORMERS):
arch = self._try_resolve_transformers(architectures[0],
model_config)
if arch is not None:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
return self._raise_for_unsupported(architectures) return self._raise_for_unsupported(architectures)
def is_text_generation_model( def is_text_generation_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_text_generation_model return model_cls.is_text_generation_model
def is_pooling_model( def is_pooling_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_pooling_model return model_cls.is_pooling_model
def is_cross_encoder_model( def is_cross_encoder_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_cross_encoding return model_cls.supports_cross_encoding
def is_multimodal_model( def is_multimodal_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal return model_cls.supports_multimodal
def supports_multimodal_raw_input( def supports_multimodal_raw_input(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_multimodal_raw_input return model_cls.supports_multimodal_raw_input
def is_pp_supported_model( def is_pp_supported_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_pp return model_cls.supports_pp
def model_has_inner_state( def model_has_inner_state(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_inner_state return model_cls.has_inner_state
def is_attention_free_model( def is_attention_free_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_attention_free return model_cls.is_attention_free
def is_hybrid_model( def is_hybrid_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.is_hybrid return model_cls.is_hybrid
def is_noops_model( def is_noops_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.has_noops return model_cls.has_noops
def is_transcription_model( def is_transcription_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription return model_cls.supports_transcription
def is_transcription_only_model( def is_transcription_only_model(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return model_cls.supports_transcription_only return model_cls.supports_transcription_only
def is_v1_compatible( def is_v1_compatible(
self, self,
architectures: Union[str, list[str]], architectures: Union[str, list[str]],
model_config: ModelConfig,
) -> bool: ) -> bool:
model_cls, _ = self.inspect_model_cls(architectures) model_cls, _ = self.inspect_model_cls(architectures, model_config)
return not model_cls.supports_v0_only return not model_cls.supports_v0_only
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional, Union
from transformers.dynamic_module_utils import get_class_from_dynamic_module
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[type]:
    """
    Best-effort variant of
    [transformers.dynamic_module_utils.get_class_from_dynamic_module][].

    Every argument except `warn_on_fail` is forwarded unchanged to the
    wrapped function.

    Args:
        warn_on_fail: When true, a failed lookup is reported through a
            warning log record that includes the traceback.

    Returns:
        Whatever the wrapped function returns, or `None` if it raised
        any `Exception`.
    """
    try:
        resolved_cls = get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        # Deliberately broad: callers treat any failure as "class not found".
        hub_name = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"

        if not warn_on_fail:
            return None

        logger.warning(
            "Unable to load %s from %s on %s.",
            class_reference,
            pretrained_model_name_or_path,
            hub_name,
            exc_info=True,
        )
        return None
    else:
        return resolved_cls
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
from typing import Optional from typing import Optional
from typing_extensions import assert_never
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
...@@ -108,6 +110,14 @@ class TokenizerGroup: ...@@ -108,6 +110,14 @@ class TokenizerGroup:
def init_tokenizer_from_configs(model_config: ModelConfig, def init_tokenizer_from_configs(model_config: ModelConfig,
scheduler_config: SchedulerConfig, scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig]): lora_config: Optional[LoRAConfig]):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
truncation_side = "right"
else:
assert_never(runner_type)
return TokenizerGroup( return TokenizerGroup(
tokenizer_id=model_config.tokenizer, tokenizer_id=model_config.tokenizer,
enable_lora=bool(lora_config), enable_lora=bool(lora_config),
...@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig, ...@@ -117,4 +127,4 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
tokenizer_mode=model_config.tokenizer_mode, tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code, trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision, revision=model_config.tokenizer_revision,
truncation_side=model_config.truncation_side) truncation_side=truncation_side)
...@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -127,8 +127,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.is_multimodal_model = model_config.is_multimodal_model self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None self.is_pooling_model = model_config.pooler_config is not None
self.is_encoder_only_model = False self.is_encoder_only_model = False
self.model_supports_multimodal_raw_input = ( self.is_multimodal_raw_input_supported = (
model_config.model_supports_multimodal_raw_input) model_config.is_multimodal_raw_input_supported)
self.max_model_len = model_config.max_model_len self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs self.max_num_reqs = scheduler_config.max_num_seqs
...@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -583,7 +583,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) -> dict[str, Any]: ) -> dict[str, Any]:
model_kwargs: dict[str, Any] = {} model_kwargs: dict[str, Any] = {}
if self.model_supports_multimodal_raw_input: if self.is_multimodal_raw_input_supported:
# This model requires the raw multimodal data in input. # This model requires the raw multimodal data in input.
if scheduler_output: if scheduler_output:
multi_modal_kwargs_list = [] multi_modal_kwargs_list = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment