Commit 4851c202 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.1' into v0.6.1-dev

parents 9b902f9e 3fd2b0d2
...@@ -9,6 +9,7 @@ from .audio import AudioPlugin ...@@ -9,6 +9,7 @@ from .audio import AudioPlugin
from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
MultiModalPlugin, MultiModalTokensCalc, NestedTensors) MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
from .image import ImagePlugin from .image import ImagePlugin
from .video import VideoPlugin
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -34,7 +35,7 @@ class MultiModalRegistry: ...@@ -34,7 +35,7 @@ class MultiModalRegistry:
:class:`~vllm.multimodal.MultiModalPlugin` for each modality. :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
""" """
DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin()) DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
def __init__( def __init__(
self, self,
......
...@@ -4,6 +4,7 @@ from io import BytesIO ...@@ -4,6 +4,7 @@ from io import BytesIO
from typing import Any, List, Optional, Tuple, TypeVar, Union from typing import Any, List, Optional, Tuple, TypeVar, Union
import numpy as np import numpy as np
import numpy.typing as npt
from PIL import Image from PIL import Image
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
...@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image, ...@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
return image return image
def try_import_video_packages() -> Any:
    """Import and return the optional ``cv2`` module.

    Returns:
        The imported ``cv2`` module object.

    Raises:
        ImportError: If ``cv2`` cannot be imported. The original import
            error context is suppressed so that only the install hint is
            shown.
    """
    try:
        import cv2 as video_backend
    except ImportError:
        install_hint = "Please install vllm[video] for video support."
        raise ImportError(install_hint) from None
    else:
        return video_backend
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
    """Resize every frame of a video to the given ``(height, width)``.

    Args:
        frames: Array whose shape unpacks into exactly four dimensions; the
            first is iterated as frames and the last is kept as the channel
            dimension of the output.
        size: Target ``(height, width)``.

    Returns:
        A new array of shape ``(num_frames, height, width, channels)`` with
        the same dtype as ``frames``.
    """
    cv2 = try_import_video_packages()

    frame_count, _src_h, _src_w, channel_count = frames.shape
    target_h, target_w = size

    # Allocate the whole output once and fill it frame by frame.
    out = np.empty((frame_count, target_h, target_w, channel_count),
                   dtype=frames.dtype)
    for idx in range(frame_count):
        # cv2.resize is given the target size in (width, height) order.
        out[idx] = cv2.resize(frames[idx], (target_w, target_h))
    return out
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """Scale the spatial size of all frames by ``size_factor``.

    The second and third dimensions of ``frames`` are multiplied by
    ``size_factor`` and truncated with ``int`` before delegating to
    :func:`resize_video`.

    Args:
        frames: Array whose shape unpacks into exactly four dimensions.
        size_factor: Multiplier applied to both height and width.

    Returns:
        The resized frames as produced by :func:`resize_video`.
    """
    _, src_h, src_w, _ = frames.shape
    scaled_size = (int(src_h * size_factor), int(src_w * size_factor))
    return resize_video(frames, scaled_size)
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """Pick ``num_frames`` evenly spaced frames along the first axis.

    Args:
        frames: Array whose first axis indexes frames.
        num_frames: Number of frames to sample. The sentinel ``-1`` returns
            ``frames`` unchanged (the same object, not a copy).

    Returns:
        ``frames`` itself when ``num_frames == -1``; otherwise a new array
        indexed at ``np.linspace(0, total - 1, num_frames)`` truncated to
        integers.
    """
    frame_total = frames.shape[0]
    if num_frames == -1:
        return frames

    # Evenly spaced indices from the first to the last frame (inclusive).
    picks = np.linspace(0, frame_total - 1, num_frames, dtype=int)
    return frames[picks, ...]
# Utilities for input processors # Utilities for input processors
_T = TypeVar("_T", str, int) _T = TypeVar("_T", str, int)
......
from functools import lru_cache
from typing import List, Union
import numpy as np
from vllm.config import ModelConfig
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.image_processor import get_video_processor
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import is_list_of
from .base import MultiModalData, MultiModalInputs
from .image import ImagePlugin
# Module-level logger for the video plugin.
logger = init_logger(__name__)
# Memoized wrappers (functools.lru_cache with its default settings) so that
# repeated calls with the same arguments reuse the previously returned object.
cached_get_video_processor = lru_cache(get_video_processor)
cached_get_tokenizer = lru_cache(get_tokenizer)
# Type alias for the video inputs currently accepted: a single array or a
# list of arrays.
VideoInput = Union[
"np.ndarray", # single video input
List["np.ndarray"],
# TODO: support more types
# List[Image.Image], List[List[Image.Image]],
# "torch.Tensor",
# List["torch.Tensor"],
# List[List["np.ndarray"]],
# List[List["torch.Tensor"]],
]
class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        """Return the modality key under which video data is registered."""
        return "video"

    def _get_hf_video_processor(self, model_config: ModelConfig):
        """Return the (cached) HuggingFace video processor for the model.

        The lookup is keyed on ``model_config.model`` and
        ``model_config.trust_remote_code``.
        """
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
    ) -> MultiModalInputs:
        """Convert raw video data into model inputs.

        Args:
            ctx: Input context; only ``ctx.model_config`` is read here.
            data: A single video as ``np.ndarray``. A list of arrays is
                recognised but not supported yet.

        Returns:
            ``MultiModalInputs`` built from the ``.data`` of the HuggingFace
            video processor output (requested with ``return_tensors="pt"``).

        Raises:
            RuntimeError: If no video processor is available.
            NotImplementedError: If ``data`` is a list of arrays.
            TypeError: If ``data`` is of any other type.
        """
        model_config = ctx.model_config

        # single video input as np.ndarray
        if isinstance(data, np.ndarray):
            video_processor = self._get_hf_video_processor(model_config)
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log which input failed, then let the original error
                # propagate unchanged.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)
        elif is_list_of(data, np.ndarray):
            raise NotImplementedError(
                "Multi video for a prompt is not supported yet")

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the default token budget for video inputs.

        NOTE(review): 4096 is a fixed constant independent of ``ctx``; its
        origin is not visible in this file.
        """
        return 4096
...@@ -42,6 +42,13 @@ try: ...@@ -42,6 +42,13 @@ try:
except Exception: except Exception:
pass pass
# Best-effort detection of a CPU build: the flag is set when the installed
# vllm distribution's version string contains "cpu". Any failure (package
# metadata missing, import error, ...) leaves the flag False.
try:
    from importlib.metadata import version
    is_cpu = "cpu" in version("vllm")
except Exception:
    is_cpu = False
if is_tpu: if is_tpu:
# people might install pytorch built with cuda but run on tpu # people might install pytorch built with cuda but run on tpu
# so we need to check tpu first # so we need to check tpu first
...@@ -53,6 +60,9 @@ elif is_cuda: ...@@ -53,6 +60,9 @@ elif is_cuda:
elif is_rocm: elif is_rocm:
from .rocm import RocmPlatform from .rocm import RocmPlatform
current_platform = RocmPlatform() current_platform = RocmPlatform()
elif is_cpu:
from .cpu import CpuPlatform
current_platform = CpuPlatform()
else: else:
current_platform = UnspecifiedPlatform() current_platform = UnspecifiedPlatform()
......
import torch
from .interface import Platform, PlatformEnum
class CpuPlatform(Platform):
    """Platform implementation tagged with ``PlatformEnum.CPU``."""

    _enum = PlatformEnum.CPU

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the device name, which is always ``"cpu"``.

        ``device_id`` is accepted for interface compatibility and ignored.
        """
        return "cpu"

    @staticmethod
    def inference_mode():
        """Return a ``torch.no_grad()`` context manager for inference."""
        return torch.no_grad()
import enum import enum
from typing import Tuple from typing import Optional, Tuple
import torch import torch
...@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum): ...@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
CUDA = enum.auto() CUDA = enum.auto()
ROCM = enum.auto() ROCM = enum.auto()
TPU = enum.auto() TPU = enum.auto()
CPU = enum.auto()
UNSPECIFIED = enum.auto() UNSPECIFIED = enum.auto()
...@@ -23,9 +24,12 @@ class Platform: ...@@ -23,9 +24,12 @@ class Platform:
def is_tpu(self) -> bool: def is_tpu(self) -> bool:
return self._enum == PlatformEnum.TPU return self._enum == PlatformEnum.TPU
def is_cpu(self) -> bool:
return self._enum == PlatformEnum.CPU
@staticmethod @staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]: def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
raise NotImplementedError return None
@staticmethod @staticmethod
def get_device_name(device_id: int = 0) -> str: def get_device_name(device_id: int = 0) -> str:
......
from typing import Tuple
import torch import torch
from .interface import Platform, PlatformEnum from .interface import Platform, PlatformEnum
...@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum ...@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
class TpuPlatform(Platform): class TpuPlatform(Platform):
_enum = PlatformEnum.TPU _enum = PlatformEnum.TPU
@staticmethod
def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
raise RuntimeError("TPU does not have device capability.")
@staticmethod @staticmethod
def inference_mode(): def inference_mode():
return torch.no_grad() return torch.no_grad()
...@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig ...@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig
from vllm.prompt_adapter.layers import ( from vllm.prompt_adapter.layers import (
VocabParallelEmbeddingWithPromptAdapter) # yapf: disable VocabParallelEmbeddingWithPromptAdapter) # yapf: disable
from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.layers import PromptAdapterMapping
from vllm.prompt_adapter.utils import load_peft_weights
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel): ...@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel):
config: PromptAdapterConfig, config: PromptAdapterConfig,
device: str = "cuda", device: str = "cuda",
) -> "PromptAdapterModel": ) -> "PromptAdapterModel":
from peft.utils import load_peft_weights
if num_virtual_tokens > config.max_prompt_adapter_token: if num_virtual_tokens > config.max_prompt_adapter_token:
raise ValueError( raise ValueError(
......
# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
import os
from typing import Optional
import torch
from huggingface_hub import file_exists, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from safetensors.torch import load_file as safe_load_file
WEIGHTS_NAME = "adapter_model.bin"
SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
# Get current device name based on available devices
def infer_device() -> str:
    """Return ``"cuda"`` when CUDA is available to torch, else ``"cpu"``."""
    return "cuda" if torch.cuda.is_available() else "cpu"
def load_peft_weights(model_id: str,
                      device: Optional[str] = None,
                      **hf_hub_download_kwargs) -> dict:
    r"""
    A helper method to load the PEFT weights from the HuggingFace Hub or locally

    Lookup order: a local ``adapter_model.safetensors``, then a local
    ``adapter_model.bin``, then the same two files on the Hub (safetensors
    preferred).

    Args:
        model_id (`str`):
            The local path to the adapter weights or the name of the adapter to
            load from the HuggingFace Hub.
        device (`str`):
            The device to load the weights onto. Defaults to the result of
            `infer_device()` when `None`.
        hf_hub_download_kwargs (`dict`):
            Additional arguments to pass to the `hf_hub_download` method when
            loading from the HuggingFace Hub. `subfolder`, `token`,
            `use_auth_token`, `revision` and `repo_type` are also read here.

    Returns:
        The loaded adapter weights.

    Raises:
        ValueError: If neither weights file can be found locally or on the Hub.
    """
    subfolder = hf_hub_download_kwargs.get("subfolder", None)
    path = (os.path.join(model_id, subfolder)
            if subfolder is not None else model_id)

    if device is None:
        device = infer_device()

    if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)):
        filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
        use_safetensors = True
    elif os.path.exists(os.path.join(path, WEIGHTS_NAME)):
        filename = os.path.join(path, WEIGHTS_NAME)
        use_safetensors = False
    else:
        # Nothing local: fall back to the Hub.
        token = hf_hub_download_kwargs.get("token", None)
        if token is None:
            token = hf_hub_download_kwargs.get("use_auth_token", None)

        hub_filename = (os.path.join(subfolder, SAFETENSORS_WEIGHTS_NAME)
                        if subfolder is not None else
                        SAFETENSORS_WEIGHTS_NAME)
        has_remote_safetensors_file = file_exists(
            repo_id=model_id,
            filename=hub_filename,
            revision=hf_hub_download_kwargs.get("revision", None),
            repo_type=hf_hub_download_kwargs.get("repo_type", None),
            token=token,
        )
        use_safetensors = has_remote_safetensors_file

        if has_remote_safetensors_file:
            # Priority 1: load safetensors weights
            filename = hf_hub_download(
                model_id,
                SAFETENSORS_WEIGHTS_NAME,
                **hf_hub_download_kwargs,
            )
        else:
            try:
                filename = hf_hub_download(model_id, WEIGHTS_NAME,
                                           **hf_hub_download_kwargs)
            except EntryNotFoundError as err:
                # Chain the original error so the Hub failure stays visible.
                raise ValueError(
                    f"Can't find weights for {model_id} in {model_id} or "
                    f"in the Hugging Face Hub. "
                    f"Please check that the file {WEIGHTS_NAME} or "
                    f"{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}."
                ) from err

    if use_safetensors:
        adapters_weights = safe_load_file(filename, device=device)
    else:
        # SECURITY: ``.bin`` checkpoints are pickles and may come from the
        # Hub (untrusted). ``weights_only=True`` restricts unpickling to
        # tensors and plain containers instead of arbitrary objects.
        adapters_weights = torch.load(filename,
                                      map_location=torch.device(device),
                                      weights_only=True)

    return adapters_weights
...@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct, ...@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct,
# is called. # is called.
_new_appended_tokens: List[int] = msgspec.field(default_factory=list) _new_appended_tokens: List[int] = msgspec.field(default_factory=list)
# It is used to compute mrope_position_ids.
_mrope_position_delta: Optional[int] = None
def __post_init__(self) -> None: def __post_init__(self) -> None:
assert self._prompt_token_ids.typecode == "l" assert self._prompt_token_ids.typecode == "l"
assert self._output_token_ids.typecode == "l" assert self._output_token_ids.typecode == "l"
...@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct, ...@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct,
assert isinstance(self._output_token_ids, array) assert isinstance(self._output_token_ids, array)
return self._output_token_ids return self._output_token_ids
@property
def mrope_position_delta(self) -> Optional[int]:
    """Stored delta used to compute mrope position ids (``None`` if unset)."""
    return self._mrope_position_delta

@mrope_position_delta.setter
def mrope_position_delta(self,
                         new_mrope_position_delta: Optional[int]) -> None:
    """Replace the stored mrope position delta."""
    self._mrope_position_delta = new_mrope_position_delta
def append_token_id(self, token_id: int, logprob: float) -> None: def append_token_id(self, token_id: int, logprob: float) -> None:
self._output_token_ids.append(token_id) self._output_token_ids.append(token_id)
self._new_appended_tokens.append(token_id) self._new_appended_tokens.append(token_id)
......
...@@ -2,7 +2,6 @@ from typing import List, Optional ...@@ -2,7 +2,6 @@ from typing import List, Optional
import torch import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
try: try:
...@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner): ...@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner):
# Update attn_metadata # Update attn_metadata
attn_metadata = model_input.attn_metadata attn_metadata = model_input.attn_metadata
assert isinstance(attn_metadata, FlashAttentionMetadata) assert isinstance(attn_metadata, FlashAttentionMetadata)
attn_metadata.advance_step(num_seqs, num_queries)
attn_metadata.advance_step(model_input, sampled_token_ids,
# Update GPU tensors self.block_size, num_seqs, num_queries)
ops.advance_step(num_seqs=num_seqs,
num_queries=num_queries,
block_size=self.block_size,
input_tokens=model_input.input_tokens,
sampled_token_ids=sampled_token_ids,
input_positions=model_input.input_positions,
seq_lens=attn_metadata.seq_lens_tensor,
slot_mapping=attn_metadata.slot_mapping,
block_tables=attn_metadata.block_tables)
# Update sampling_metadata # Update sampling_metadata
sampling_metadata = model_input.sampling_metadata sampling_metadata = model_input.sampling_metadata
......
import contextlib import contextlib
import enum
import json
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Optional, Type, Union from typing import Any, Dict, Optional, Type, Union
from huggingface_hub import file_exists, hf_hub_download
from transformers import GenerationConfig, PretrainedConfig from transformers import GenerationConfig, PretrainedConfig
from transformers.models.auto.image_processing_auto import ( from transformers.models.auto.image_processing_auto import (
get_image_processor_config) get_image_processor_config)
from transformers.models.auto.modeling_auto import ( from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
from vllm.envs import VLLM_USE_MODELSCOPE from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger from vllm.logger import init_logger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
EAGLEConfig, ExaoneConfig, EAGLEConfig, ExaoneConfig,
InternVLChatConfig, JAISConfig, GraniteConfig, InternVLChatConfig,
MedusaConfig, MLPSpeculatorConfig, JAISConfig, MedusaConfig,
MPTConfig, NemotronConfig, MLPSpeculatorConfig, MPTConfig,
RWConfig, UltravoxConfig) NemotronConfig, RWConfig,
UltravoxConfig)
# yapf: enable
from vllm.transformers_utils.utils import check_gguf_file from vllm.transformers_utils.utils import check_gguf_file
if VLLM_USE_MODELSCOPE: if VLLM_USE_MODELSCOPE:
...@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE: ...@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE:
else: else:
from transformers import AutoConfig from transformers import AutoConfig
MISTRAL_CONFIG_NAME = "params.json"
logger = init_logger(__name__) logger = init_logger(__name__)
_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
...@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { ...@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"internvl_chat": InternVLChatConfig, "internvl_chat": InternVLChatConfig,
"nemotron": NemotronConfig, "nemotron": NemotronConfig,
"ultravox": UltravoxConfig, "ultravox": UltravoxConfig,
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"granite": GraniteConfig,
} }
for name, cls in _CONFIG_REGISTRY.items(): for name, cls in _CONFIG_REGISTRY.items():
...@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items(): ...@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items():
AutoConfig.register(name, cls) AutoConfig.register(name, cls)
class ConfigFormat(str, enum.Enum):
    """String-valued enum naming the supported model config formats.

    Members compare equal to their string values because the enum also
    derives from ``str``.
    """

    # Let the loader pick a format itself.
    AUTO = "auto"
    # HuggingFace-style configuration.
    HF = "hf"
    # Mistral-style configuration.
    MISTRAL = "mistral"
def file_or_path_exists(model: Union[str, Path], config_name, revision,
                        token) -> bool:
    """Check whether ``config_name`` exists for ``model``.

    If ``model`` is an existing local path, the check is purely local: the
    result is whether ``model / config_name`` is a regular file. Otherwise
    ``model`` is treated as a Hub repo id and ``file_exists`` is queried with
    the given ``revision`` and ``token``.
    """
    local_root = Path(model)
    if not local_root.exists():
        # Not on disk: ask the Hub.
        return file_exists(model, config_name, revision=revision, token=token)
    return (local_root / config_name).is_file()
def get_config( def get_config(
model: Union[str, Path], model: Union[str, Path],
trust_remote_code: bool, trust_remote_code: bool,
...@@ -53,38 +80,68 @@ def get_config( ...@@ -53,38 +80,68 @@ def get_config(
code_revision: Optional[str] = None, code_revision: Optional[str] = None,
rope_scaling: Optional[dict] = None, rope_scaling: Optional[dict] = None,
rope_theta: Optional[float] = None, rope_theta: Optional[float] = None,
config_format: ConfigFormat = ConfigFormat.AUTO,
**kwargs, **kwargs,
) -> PretrainedConfig: ) -> PretrainedConfig:
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
is_gguf = check_gguf_file(model) is_gguf = check_gguf_file(model)
if is_gguf: if is_gguf:
kwargs["gguf_file"] = Path(model).name kwargs["gguf_file"] = Path(model).name
model = Path(model).parent model = Path(model).parent
try: if config_format == ConfigFormat.AUTO:
config = AutoConfig.from_pretrained( if is_gguf or file_or_path_exists(model,
model, HF_CONFIG_NAME,
trust_remote_code=trust_remote_code, revision=revision,
revision=revision, token=kwargs.get("token")):
code_revision=code_revision, config_format = ConfigFormat.HF
**kwargs) elif file_or_path_exists(model,
except ValueError as e: MISTRAL_CONFIG_NAME,
if (not trust_remote_code and revision=revision,
"requires you to execute the configuration file" in str(e)): token=kwargs.get("token")):
err_msg = ( config_format = ConfigFormat.MISTRAL
"Failed to load the model config. If the model is a custom " else:
"model not yet available in the HuggingFace transformers " raise ValueError(f"No supported config format found in {model}")
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI.") if config_format == ConfigFormat.HF:
raise RuntimeError(err_msg) from e config_dict, _ = PretrainedConfig.get_config_dict(
model, revision=revision, code_revision=code_revision, **kwargs)
# Use custom model class if it's in our registry
model_type = config_dict.get("model_type")
if model_type in _CONFIG_REGISTRY:
config_class = _CONFIG_REGISTRY[model_type]
config = config_class.from_pretrained(model,
revision=revision,
code_revision=code_revision)
else: else:
raise e try:
if config.model_type in _CONFIG_REGISTRY: config = AutoConfig.from_pretrained(
config_class = _CONFIG_REGISTRY[config.model_type] model,
config = config_class.from_pretrained(model, trust_remote_code=trust_remote_code,
revision=revision, revision=revision,
code_revision=code_revision) code_revision=code_revision,
**kwargs,
)
except ValueError as e:
if (not trust_remote_code
and "requires you to execute the configuration file"
in str(e)):
err_msg = (
"Failed to load the model config. If the model "
"is a custom model not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
elif config_format == ConfigFormat.MISTRAL:
config = load_params_config(model, revision)
else:
raise ValueError(f"Unsupported config format: {config_format}")
# Special architecture mapping check for GGUF models # Special architecture mapping check for GGUF models
if is_gguf: if is_gguf:
...@@ -94,16 +151,81 @@ def get_config( ...@@ -94,16 +151,81 @@ def get_config(
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
config.update({"architectures": [model_type]}) config.update({"architectures": [model_type]})
for key, value in [("rope_scaling", rope_scaling), for key, value in [
("rope_theta", rope_theta)]: ("rope_scaling", rope_scaling),
("rope_theta", rope_theta),
]:
if value is not None: if value is not None:
logger.info("Updating %s from %r to %r", key, logger.info(
getattr(config, key, None), value) "Updating %s from %r to %r",
key,
getattr(config, key, None),
value,
)
config.update({key: value}) config.update({key: value})
return config return config
def load_params_config(model, revision) -> PretrainedConfig:
    """Load a Mistral-format ``params.json`` into a ``PretrainedConfig``.

    Args:
        model: Local directory containing ``params.json``, or a Hub repo id
            from which the file is downloaded when no local file exists.
        revision: Revision passed to ``hf_hub_download`` for the Hub case.

    Returns:
        A ``PretrainedConfig`` in which every nested dict has itself been
        converted to a ``PretrainedConfig`` and selected keys have been
        renamed via ``config_mapping``.
    """
    # This function loads a params.json config which
    # should be used when loading models in mistral format

    config_file_name = "params.json"

    config_path = Path(model) / config_file_name

    if not config_path.is_file():
        config_path = Path(
            hf_hub_download(model, config_file_name, revision=revision))

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(config_path, encoding="utf-8") as file:
        config_dict = json.load(file)

    # Key renames applied at every nesting level.
    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        """Convert dicts (recursively) to ``PretrainedConfig``; pass others."""
        if isinstance(elem, dict):
            converted = {}
            for key, value in elem.items():
                key = config_mapping.get(key, key)
                converted[key] = recurse_elems(value)
            return PretrainedConfig(**converted)
        else:
            return elem

    # Fill in defaults for fields the file may omit.
    config_dict.setdefault("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict.setdefault("max_seq_len", 128_000)

    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    if config_dict.get("vision_encoder") is not None:
        # With a vision encoder, everything else becomes the text sub-config
        # and the top level describes the multimodal model.
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    config = recurse_elems(config_dict)
    return config
def get_hf_image_processor_config( def get_hf_image_processor_config(
model: Union[str, Path], model: Union[str, Path],
revision: Optional[str] = None, revision: Optional[str] = None,
...@@ -120,7 +242,7 @@ def get_hf_image_processor_config( ...@@ -120,7 +242,7 @@ def get_hf_image_processor_config(
def get_hf_text_config(config: PretrainedConfig): def get_hf_text_config(config: PretrainedConfig):
"""Get the "sub" config relevant to llm for multi modal models. """Get the "sub" config relevant to llm for multi modal models.
No op for pure text models. No op for pure text models.
""" """
if hasattr(config, "text_config"): if hasattr(config, "text_config"):
# The code operates under the assumption that text_config should have # The code operates under the assumption that text_config should have
......
...@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig ...@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library. # `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.granite import GraniteConfig
from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.internvl import InternVLChatConfig
from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.medusa import MedusaConfig
...@@ -27,4 +28,7 @@ __all__ = [ ...@@ -27,4 +28,7 @@ __all__ = [
"MLPSpeculatorConfig", "MLPSpeculatorConfig",
"NemotronConfig", "NemotronConfig",
"UltravoxConfig", "UltravoxConfig",
# Granite can be removed from here once we have upgraded to
# transformers 4.45+
"GraniteConfig",
] ]
from typing import cast from typing import cast
def get_video_processor(
    processor_name: str,
    trust_remote_code: bool = False,
):
    """
    Gets a video processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path passed to
            ``AutoProcessor.from_pretrained``.
        trust_remote_code: Forwarded to ``AutoProcessor.from_pretrained``.

    Returns:
        The ``video_processor`` attribute of the loaded processor.

    Raises:
        RuntimeError: If loading raises ``ValueError`` while
            ``trust_remote_code`` is false; the message suggests enabling it.
        ValueError: Re-raised unchanged when ``trust_remote_code`` is true.
    """
    # Imported lazily, matching the other processor getters in this file.
    from transformers import AutoProcessor

    try:
        # Forward the flag; otherwise `trust_remote_code=True` has no effect
        # and custom processors can never be loaded through this helper.
        processor = AutoProcessor.from_pretrained(
            processor_name, trust_remote_code=trust_remote_code)
        video_processor = processor.video_processor
    except ValueError as e:
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise
    return video_processor
def get_image_processor( def get_image_processor(
processor_name: str, processor_name: str,
*args, *args,
......
from typing import cast
def get_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Gets a processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoProcessor
    from transformers.processing_utils import ProcessorMixin

    try:
        loaded = AutoProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # With remote code already trusted there is nothing to suggest:
        # surface the original error as-is.
        if trust_remote_code:
            raise e

        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        err_msg = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(ProcessorMixin, loaded)
...@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy, ...@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer) Tekkenizer)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ConversationMessage from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@dataclass @dataclass
...@@ -45,26 +45,25 @@ class MistralTokenizer: ...@@ -45,26 +45,25 @@ class MistralTokenizer:
def __init__(self, tokenizer: PublicMistralTokenizer) -> None: def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
self.mistral = tokenizer self.mistral = tokenizer
self.instruct = tokenizer.instruct_tokenizer self.instruct = tokenizer.instruct_tokenizer
self.tokenizer = tokenizer.instruct_tokenizer.tokenizer
self.vocab_size = len(self.tokenizer.vocab()) tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
if isinstance(tokenizer_, Tekkenizer):
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
# Make sure special tokens will not raise # Make sure special tokens will not raise
self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
self._is_tekken = is_tekken self._vocab = {
token: idx
for idx, token in enumerate(tokenizer_.vocab())
}
elif isinstance(tokenizer_, SentencePieceTokenizer):
self._vocab = {
token: idx
for idx, token in enumerate(tokenizer_.vocab())
}
else:
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
# the following attributes are set to fit VLLM's design self.tokenizer = tokenizer_
self.is_fast = True
self.chat_template = True
self.all_special_ids: List[Any] = []
self.all_special_tokens: List[Any] = []
self.all_special_tokens_extended: List[Any] = []
@classmethod @classmethod
def from_pretrained(cls, def from_pretrained(cls,
...@@ -102,6 +101,38 @@ class MistralTokenizer: ...@@ -102,6 +101,38 @@ class MistralTokenizer:
revision=revision) revision=revision)
return tokenizer_file return tokenizer_file
# the following attributes are set to fit VLLM's design
@property
def all_special_tokens_extended(self) -> List[str]:
    """Always an empty list; present for interface compatibility."""
    no_tokens: List[str] = []
    return no_tokens
@property
def all_special_tokens(self) -> List[str]:
    """Always an empty list; present for interface compatibility."""
    no_tokens: List[str] = []
    return no_tokens
@property
def all_special_ids(self) -> List[int]:
    """Always an empty list; present for interface compatibility."""
    no_ids: List[int] = []
    return no_ids
@property
def bos_token_id(self) -> int:
    """Return ``bos_id`` of the wrapped tokenizer."""
    wrapped = self.tokenizer
    return wrapped.bos_id
@property
def eos_token_id(self) -> int:
    """Return ``eos_id`` of the wrapped tokenizer."""
    wrapped = self.tokenizer
    return wrapped.eos_id
@property
def is_fast(self) -> bool:
    """Always ``True``; present for interface compatibility."""
    return True
@property
def vocab_size(self) -> int:
    """Number of entries in the cached ``_vocab`` mapping."""
    vocab = self._vocab
    return len(vocab)
def __len__(self) -> int:
    """Return ``self.vocab_size`` so ``len(tokenizer)`` works."""
    size = self.vocab_size
    return size
def __call__( def __call__(
self, self,
prompt: str, prompt: str,
...@@ -117,31 +148,34 @@ class MistralTokenizer: ...@@ -117,31 +148,34 @@ class MistralTokenizer:
return Encoding(input_ids=input_ids) return Encoding(input_ids=input_ids)
def get_added_vocab(self) -> List[str]: def get_vocab(self) -> Dict[str, int]:
return self._vocab
def get_added_vocab(self) -> Dict[str, int]:
# Mistral tokenizers have no added vocabulary # Mistral tokenizers have no added vocabulary
return [] return {}
def encode(self, prompt: str) -> List[int]: def encode(self, prompt: str) -> List[int]:
# `encode ` should only be used for prompt completion # `encode` should only be used for prompt completion
# it should never be used for chat_completion. # it should never be used for chat_completion.
# For chat completion use `apply_chat_template` # For chat completion use `apply_chat_template`
return self.tokenizer.encode(prompt, bos=True, eos=False) return self.tokenizer.encode(prompt, bos=True, eos=False)
def apply_chat_template(self, def apply_chat_template(self,
conversation: List["ConversationMessage"], messages: List["ChatCompletionMessageParam"],
tools: Optional[Dict[str, Any]] = None, tools: Optional[Dict[str, Any]] = None,
**kwargs) -> List[int]: **kwargs) -> List[int]:
assert tools is None, "`tools` are not yet supported." assert tools is None, "`tools` are not yet supported."
request = ChatCompletionRequest( request = ChatCompletionRequest(
messages=conversation) # type: ignore[type-var] messages=messages) # type: ignore[type-var]
encoded = self.mistral.encode_chat_completion(request) encoded = self.mistral.encode_chat_completion(request)
# encode-decode to get clean prompt # encode-decode to get clean prompt
return encoded.tokens return encoded.tokens
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
if self._is_tekken: if isinstance(self.tokenizer, Tekkenizer):
return "".join(tokens) return "".join(tokens)
else: else:
return self.tokenizer.decode(tokens) # type: ignore[arg-type] return self.tokenizer.decode(tokens) # type: ignore[arg-type]
...@@ -151,14 +185,11 @@ class MistralTokenizer: ...@@ -151,14 +185,11 @@ class MistralTokenizer:
ids = [ids] ids = [ids]
return self.tokenizer.decode(ids) return self.tokenizer.decode(ids)
@property
def eos_token_id(self):
return self.tokenizer.eos_id
def convert_ids_to_tokens( def convert_ids_to_tokens(
self, self,
ids: List[int], ids: List[int],
skip_special_tokens: Optional[bool] = True) -> List[str]: skip_special_tokens: bool = True,
) -> List[str]:
# TODO(Patrick) - potentially allow special tokens to not be skipped # TODO(Patrick) - potentially allow special tokens to not be skipped
assert ( assert (
skip_special_tokens skip_special_tokens
...@@ -170,6 +201,3 @@ class MistralTokenizer: ...@@ -170,6 +201,3 @@ class MistralTokenizer:
tokens = [self.tokenizer.id_to_piece(id) for id in ids] tokens = [self.tokenizer.id_to_piece(id) for id in ids]
return tokens return tokens
def __len__(self):
    """Return ``self.vocab_size`` so that ``len()`` works on the object."""
    return self.vocab_size
...@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, ...@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
def supports_dynamo() -> bool:
    """Report whether the installed torch is at least version 2.4.0.

    Only the ``base_version`` of ``torch.__version__`` takes part in the
    comparison.

    Returns:
        ``True`` when the base torch version is >= 2.4.0, else ``False``.
    """
    base_torch_version = Version(Version(torch.__version__).base_version)
    return base_torch_version >= Version("2.4.0")
class AtomicCounter:
    """An atomic, thread-safe counter.

    Updates made through :meth:`inc` and :meth:`dec` are serialized by an
    internal ``threading.Lock``.
    """

    def __init__(self, initial: int = 0) -> None:
        """Initialize a new atomic counter to the given initial value."""
        self._value = initial
        self._lock = threading.Lock()

    def inc(self, num: int = 1) -> int:
        """Atomically increment the counter by num and return the new value."""
        with self._lock:
            self._value += num
            return self._value

    def dec(self, num: int = 1) -> int:
        """Atomically decrement the counter by num and return the new value."""
        with self._lock:
            self._value -= num
            return self._value

    @property
    def value(self) -> int:
        """Return the current value; this read does not take the lock."""
        return self._value
...@@ -9,4 +9,4 @@ except Exception as e: ...@@ -9,4 +9,4 @@ except Exception as e:
stacklevel=2) stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER" __commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.6.0" __version__ = "0.6.1"
...@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): ...@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def init_device(self) -> None: def init_device(self) -> None:
if self.local_omp_cpuid != "all": if self.local_omp_cpuid != "all":
torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
logger.info(ret)
self.init_distributed_environment() self.init_distributed_environment()
# Set random seed. # Set random seed.
......
...@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping ...@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.model_executor import SamplingMetadata, SamplingMetadataCache from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
...@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2 ...@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2
TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
# For now, bump up cache limits for recompilations during CUDA graph warmups.
torch._dynamo.config.cache_size_limit = 128
torch._dynamo.config.accumulated_cache_size_limit = 128
@dataclass(frozen=True) @dataclass(frozen=True)
class ModelInputForGPU(ModelRunnerInputBase): class ModelInputForGPU(ModelRunnerInputBase):
...@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
def simple_reinit(self): def simple_reinit(self):
self.input_tokens[0].clear() # type: ignore self.input_tokens[0].clear() # type: ignore
self.input_positions[0].clear() # type: ignore self.input_positions[0].clear() # type: ignore
self.mrope_input_positions = None # type: ignore
self.seq_lens[0] = 0 # type: ignore self.seq_lens[0] = 0 # type: ignore
self.orig_seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore
self.query_lens[0] = 0 # type: ignore self.query_lens[0] = 0 # type: ignore
...@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Input tokens and positions. # Input tokens and positions.
input_tokens: Optional[List[List[int]]] = None, input_tokens: Optional[List[List[int]]] = None,
input_positions: Optional[List[List[int]]] = None, input_positions: Optional[List[List[int]]] = None,
mrope_input_positions: Optional[List[List[List[int]]]] = None,
# The sequence length (may be capped to the sliding window). # The sequence length (may be capped to the sliding window).
seq_lens: Optional[List[int]] = None, seq_lens: Optional[List[int]] = None,
...@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for seq_id in range(len(self.seq_ids)): for seq_id in range(len(self.seq_ids)):
self.input_positions[seq_id].clear() self.input_positions[seq_id].clear()
self.mrope_input_positions = None
if seq_lens: if seq_lens:
self.seq_lens = seq_lens self.seq_lens = seq_lens
else: else:
...@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
else: else:
self.input_tokens = input_tokens or [] self.input_tokens = input_tokens or []
self.input_positions = input_positions or [] self.input_positions = input_positions or []
self.mrope_input_positions = mrope_input_positions or None
self.seq_lens = seq_lens or [] self.seq_lens = seq_lens or []
self.orig_seq_lens = orig_seq_lens or [] self.orig_seq_lens = orig_seq_lens or []
self.query_lens = query_lens or [] self.query_lens = query_lens or []
...@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self.input_tokens = [[] for _ in range(self.n_seqs)] self.input_tokens = [[] for _ in range(self.n_seqs)]
self.input_positions = [[] for _ in range(self.n_seqs)] self.input_positions = [[] for _ in range(self.n_seqs)]
self.mrope_input_positions = None
self.seq_lens = [0] * self.n_seqs self.seq_lens = [0] * self.n_seqs
self.orig_seq_lens = [0] * self.n_seqs self.orig_seq_lens = [0] * self.n_seqs
self.query_lens = [0] * self.n_seqs self.query_lens = [0] * self.n_seqs
...@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
inter_data.query_lens[ inter_data.query_lens[
seq_idx] = seq_len - context_len if inter_data.is_prompt else 1 seq_idx] = seq_len - context_len if inter_data.is_prompt else 1
if seq_data.mrope_position_delta is not None:
if inter_data.mrope_input_positions is None:
inter_data.mrope_input_positions = [None] * inter_data.n_seqs
inter_data.mrope_input_positions[
seq_idx] = MRotaryEmbedding.get_next_input_positions(
seq_data.mrope_position_delta,
context_len,
seq_len,
)
def _compute_for_prefix_cache_hit( def _compute_for_prefix_cache_hit(
self, inter_data: InterDataForSeqGroup, seq_idx: int, self, inter_data: InterDataForSeqGroup, seq_idx: int,
seq_group_metadata: SequenceGroupMetadata): seq_group_metadata: SequenceGroupMetadata):
...@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
mm_kwargs = self.multi_modal_input_mapper(mm_data) mm_kwargs = self.multi_modal_input_mapper(mm_data)
inter_data.multi_modal_inputs = mm_kwargs inter_data.multi_modal_inputs = mm_kwargs
# special processing for mrope position deltas.
if self.runner.model_is_mrope:
image_grid_thw = mm_kwargs.get("image_grid_thw", None)
video_grid_thw = mm_kwargs.get("video_grid_thw", None)
assert image_grid_thw is not None or video_grid_thw is not None, (
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw'.")
hf_config = self.runner.model_config.hf_config
inter_data.mrope_input_positions = [None] * inter_data.n_seqs
for seq_idx in range(inter_data.n_seqs):
seq_data = seq_group_metadata.seq_data[
inter_data.seq_ids[seq_idx]]
token_ids = seq_data.get_token_ids()
mrope_input_positions, mrope_position_delta = \
MRotaryEmbedding.get_input_positions(
token_ids,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
image_token_id=hf_config.image_token_id,
video_token_id=hf_config.video_token_id,
vision_start_token_id=hf_config.vision_start_token_id,
vision_end_token_id=hf_config.vision_end_token_id,
spatial_merge_size=hf_config.vision_config.
spatial_merge_size,
context_len=inter_data.context_lens[seq_idx],
)
seq_data.mrope_position_delta = mrope_position_delta
inter_data.mrope_input_positions[
seq_idx] = mrope_input_positions
def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
"""Add a sequence group to the builder.""" """Add a sequence group to the builder."""
seq_ids = seq_group_metadata.seq_data.keys() seq_ids = seq_group_metadata.seq_data.keys()
...@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# prefix caching and there is no decode request. # prefix caching and there is no decode request.
return self.model_input_cls() return self.model_input_cls()
input_positions = [] mrope_input_positions: Optional[List[List[int]]] = None
for inter_data in self.inter_data_list: if any(inter_data.mrope_input_positions is not None
for cur_input_positions in inter_data.input_positions: for inter_data in self.inter_data_list):
input_positions.extend(cur_input_positions) mrope_input_positions = [[] for _ in range(3)]
for idx in range(3):
for inter_data in self.inter_data_list:
msections = inter_data.mrope_input_positions
if msections is None:
for _seq_input_positions in inter_data.input_positions:
mrope_input_positions[idx].extend(
_seq_input_positions)
else:
for _seq_mrope_input_positions in msections:
mrope_input_positions[idx].extend(
_seq_mrope_input_positions[idx])
input_positions = None
else:
input_positions = []
for inter_data in self.inter_data_list:
for cur_input_positions in inter_data.input_positions:
input_positions.extend(cur_input_positions)
seq_lens = [] seq_lens = []
max_decode_seq_len = 0 max_decode_seq_len = 0
...@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Tokens and positions. # Tokens and positions.
if cuda_graph_pad_size: if cuda_graph_pad_size:
input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size))
input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
assert self.runner.device is not None assert self.runner.device is not None
input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long,
self.runner.device, self.runner.device,
self.runner.pin_memory) self.runner.pin_memory)
input_positions_tensor = async_tensor_h2d(input_positions, torch.long, if mrope_input_positions is not None:
self.runner.device, for idx in range(3):
self.runner.pin_memory) mrope_input_positions[idx].extend(
itertools.repeat(0, cuda_graph_pad_size))
input_positions_tensor = async_tensor_h2d(mrope_input_positions,
torch.long,
self.runner.device,
self.runner.pin_memory)
else:
input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
input_positions_tensor = async_tensor_h2d(input_positions,
torch.long,
self.runner.device,
self.runner.pin_memory)
# Sequence and query lengths. # Sequence and query lengths.
if cuda_graph_pad_size: if cuda_graph_pad_size:
seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size))
...@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"This may lead to less accurate results!") "This may lead to less accurate results!")
if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo(): if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
self.model = torch.compile(self.model, self.model = torch.compile(
fullgraph=True, self.model,
backend="eager") fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
backend="eager")
def save_sharded_state( def save_sharded_state(
self, self,
...@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
raise RuntimeError("PromptAdapter is not enabled.") raise RuntimeError("PromptAdapter is not enabled.")
return self.prompt_adapter_manager.list_adapters() return self.prompt_adapter_manager.list_adapters()
@property
def model_is_mrope(self) -> bool:
    """Detect if the model has the "mrope" rope_scaling type.

    mrope requires keeping "rope_deltas" between the prompt and decoding
    phases.

    Returns:
        ``True`` only when ``hf_config.rope_scaling`` is a mapping whose
        ``"type"`` entry equals ``"mrope"``. A missing attribute (treated
        as an empty dict) or an explicit ``None`` both yield ``False``.
    """
    rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
    if rope_scaling is None:
        return False
    return rope_scaling.get("type", None) == "mrope"
@torch.inference_mode() @torch.inference_mode()
def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
"""Cuda graph capture a model. """Cuda graph capture a model.
...@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
max_batch_size = self.max_batchsize_to_capture max_batch_size = self.max_batchsize_to_capture
input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
if self.model_is_mrope:
input_positions = torch.tile(input_positions, (3, 1))
# Prepare dummy previous_hidden_states only if needed by the model. # Prepare dummy previous_hidden_states only if needed by the model.
# This is used by draft models such as EAGLE. # This is used by draft models such as EAGLE.
previous_hidden_states = None previous_hidden_states = None
...@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
"input_ids": "input_ids":
input_tokens[:batch_size], input_tokens[:batch_size],
"positions": "positions":
input_positions[:batch_size], input_positions[..., :batch_size],
"hidden_or_intermediate_states": "hidden_or_intermediate_states":
hidden_or_intermediate_states[ hidden_or_intermediate_states[
virtual_engine] # type: ignore virtual_engine] # type: ignore
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment