Merge tag 'v0.6.1' into v0.6.1-dev

4851c202 · zhuwenwen · 9b902f9e · 3fd2b0d2 · 4851c202 · 4851c202
Commit 4851c202 authored Sep 13, 2024 by zhuwenwen
20 changed files
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -9,6 +9,7 @@ from .audio import AudioPlugin
 from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
                   MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
 from .image import ImagePlugin
+from .video import VideoPlugin
 logger = init_logger(__name__)
@@ -34,7 +35,7 @@ class MultiModalRegistry:
    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
    """
-    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin())
+    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())
    def __init__(
            self,

--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -4,6 +4,7 @@ from io import BytesIO
 from typing import Any, List, Optional, Tuple, TypeVar, Union
 import numpy as np
+import numpy.typing as npt
 from PIL import Image
 from vllm.connections import global_http_connection
@@ -187,6 +188,47 @@ def rescale_image_size(image: Image.Image,
    return image
+def try_import_video_packages() -> Any:
+    """Import the ``cv2`` module lazily and return it.
+
+    Raises:
+        ImportError: If ``cv2`` cannot be imported. The message points at the
+            ``vllm[video]`` extra and the original exception context is
+            suppressed.
+    """
+    try:
+        import cv2 as video_backend
+    except ImportError:
+        install_hint = "Please install vllm[video] for video support."
+        raise ImportError(install_hint) from None
+    else:
+        return video_backend
+def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
+    """Resize every frame of a video to the same spatial size.
+
+    Args:
+        frames: 4-D array whose shape is unpacked as
+            ``(num_frames, height, width, channels)``.
+        size: Target size given as ``(height, width)``.
+
+    Returns:
+        A newly allocated array of shape
+        ``(num_frames, new_height, new_width, channels)`` with the same dtype
+        as ``frames``.
+    """
+    cv2 = try_import_video_packages()
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        # cv2.resize expects the target size as (width, height).
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        # cv2.resize drops a trailing singleton channel axis, which would not
+        # broadcast into the (H, W, 1) slot; restore the axis explicitly.
+        resized_frames[i] = resized_frame.reshape(new_height, new_width,
+                                                  channels)
+    return resized_frames
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    """Scale the height and width of all frames by ``size_factor``.
+
+    Each scaled dimension is truncated with ``int()`` before the frames are
+    handed to ``resize_video``.
+    """
+    _num_frames, old_height, old_width, _channels = frames.shape
+    target_size = (int(old_height * size_factor), int(old_width * size_factor))
+    return resize_video(frames, target_size)
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    """Pick ``num_frames`` evenly spaced frames along the first axis.
+
+    Args:
+        frames: Array whose first axis indexes the frames.
+        num_frames: Number of frames to keep; ``-1`` returns ``frames``
+            unchanged (the same object, not a copy).
+
+    Returns:
+        The selected frames, first and last frame included.
+    """
+    last_index = frames.shape[0] - 1
+    # Guard clause: -1 is the "keep everything" sentinel.
+    if num_frames == -1:
+        return frames
+    # Integer-typed linspace gives evenly spaced indices over [0, last_index].
+    picks = np.linspace(0, last_index, num_frames, dtype=int)
+    return frames[picks, ...]
 # Utilities for input processors
 _T = TypeVar("_T", str, int)

--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
+from functools import lru_cache
+from typing import List, Union
+import numpy as np
+from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
+from vllm.logger import init_logger
+from vllm.transformers_utils.image_processor import get_video_processor
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils import is_list_of
+from .base import MultiModalData, MultiModalInputs
+from .image import ImagePlugin
+logger = init_logger(__name__)
+cached_get_video_processor = lru_cache(get_video_processor)
+cached_get_tokenizer = lru_cache(get_tokenizer)
+VideoInput = Union[
+    "np.ndarray",  # single video input
+    List["np.ndarray"],
+    # TODO: support more types
+    # List[Image.Image], List[List[Image.Image]],
+    # "torch.Tensor",
+    # List["torch.Tensor"],
+    # List[List["np.ndarray"]],
+    # List[List["torch.Tensor"]],
+]
+class VideoPlugin(ImagePlugin):
+    """Plugin for video data.
+
+    Only a single video, passed as one ``np.ndarray``, is supported per
+    prompt; a list of arrays is rejected with ``NotImplementedError``.
+    """
+    def get_data_key(self) -> str:
+        """Return the modality key handled by this plugin (``"video"``)."""
+        return "video"
+    def _get_hf_video_processor(self, model_config: ModelConfig):
+        """Return the HuggingFace video processor for ``model_config.model``.
+
+        The lookup goes through an ``lru_cache`` wrapper keyed on the model
+        name and the ``trust_remote_code`` flag.
+        """
+        return cached_get_video_processor(
+            model_config.model,
+            trust_remote_code=model_config.trust_remote_code)
+    def _default_input_mapper(
+        self,
+        ctx: InputContext,
+        data: MultiModalData[object],
+    ) -> MultiModalInputs:
+        """Run the HuggingFace video processor on a single video.
+
+        Raises:
+            RuntimeError: If no video processor is available for the model.
+            NotImplementedError: If ``data`` is a list of arrays.
+            TypeError: If ``data`` is neither an array nor a list of arrays.
+        """
+        model_config = ctx.model_config
+        # single video input as np.ndarray
+        if isinstance(data, np.ndarray):
+            video_processor = self._get_hf_video_processor(model_config)
+            if video_processor is None:
+                raise RuntimeError("No HuggingFace processor is available "
+                                   "to process the video object")
+            try:
+                batch_data = video_processor(data, return_tensors="pt").data
+            except Exception:
+                # Log for context, then let the original exception propagate.
+                logger.error("Failed to process video (%s)", data)
+                raise
+            return MultiModalInputs(batch_data)
+        elif is_list_of(data, np.ndarray):
+            raise NotImplementedError(
+                "Multi video for a prompt is not supported yet")
+        raise TypeError(f"Invalid video type: {type(data)}")
+    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
+        """Return a fixed bound of 4096 tokens; ``ctx`` is not consulted."""
+        return 4096
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -42,6 +42,13 @@ try:
 except Exception:
    pass
+is_cpu = False
+try:
+    from importlib.metadata import version
+    is_cpu = "cpu" in version("vllm")
+except Exception:
+    pass
 if is_tpu:
    # people might install pytorch built with cuda but run on tpu
    # so we need to check tpu first
@@ -53,6 +60,9 @@ elif is_cuda:
 elif is_rocm:
    from .rocm import RocmPlatform
    current_platform = RocmPlatform()
+elif is_cpu:
+    from .cpu import CpuPlatform
+    current_platform = CpuPlatform()
 else:
    current_platform = UnspecifiedPlatform()

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
+import torch
+from .interface import Platform, PlatformEnum
+class CpuPlatform(Platform):
+    """Platform implementation tagged with ``PlatformEnum.CPU``."""
+    _enum = PlatformEnum.CPU
+    @staticmethod
+    def get_device_name(device_id: int = 0) -> str:
+        """Return ``"cpu"``; ``device_id`` is accepted but not used."""
+        device_name = "cpu"
+        return device_name
+    @staticmethod
+    def inference_mode():
+        """Return a ``torch.no_grad()`` context manager."""
+        no_grad_ctx = torch.no_grad()
+        return no_grad_ctx
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
 import enum
-from typing import Tuple
+from typing import Optional, Tuple
 import torch
@@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum):
    CUDA = enum.auto()
    ROCM = enum.auto()
    TPU = enum.auto()
+    CPU = enum.auto()
    UNSPECIFIED = enum.auto()
@@ -23,9 +24,12 @@ class Platform:
    def is_tpu(self) -> bool:
        return self._enum == PlatformEnum.TPU
+    def is_cpu(self) -> bool:
+        return self._enum == PlatformEnum.CPU
    @staticmethod
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+    def get_device_capability(device_id: int = 0) -> Optional[Tuple[int, int]]:
-        raise NotImplementedError
+        return None
    @staticmethod
    def get_device_name(device_id: int = 0) -> str:

--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
-from typing import Tuple
 import torch
 from .interface import Platform, PlatformEnum
@@ -8,10 +6,6 @@ from .interface import Platform, PlatformEnum
 class TpuPlatform(Platform):
    _enum = PlatformEnum.TPU
-    @staticmethod
-    def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
-        raise RuntimeError("TPU does not have device capability.")
    @staticmethod
    def inference_mode():
        return torch.no_grad()
--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
@@ -14,6 +14,7 @@ from vllm.config import PromptAdapterConfig
 from vllm.prompt_adapter.layers import (
    VocabParallelEmbeddingWithPromptAdapter)  # yapf: disable
 from vllm.prompt_adapter.layers import PromptAdapterMapping
+from vllm.prompt_adapter.utils import load_peft_weights
 logger = logging.getLogger(__name__)
@@ -90,7 +91,6 @@ class PromptAdapterModel(AdapterModel):
        config: PromptAdapterConfig,
        device: str = "cuda",
    ) -> "PromptAdapterModel":
-        from peft.utils import load_peft_weights
        if num_virtual_tokens > config.max_prompt_adapter_token:
            raise ValueError(

--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
+# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
+import os
+from typing import Optional
+import torch
+from huggingface_hub import file_exists, hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+from safetensors.torch import load_file as safe_load_file
+WEIGHTS_NAME = "adapter_model.bin"
+SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
+# Get current device name based on available devices
+def infer_device() -> str:
+    """Return ``"cuda"`` when torch reports CUDA as available, else ``"cpu"``."""
+    return "cuda" if torch.cuda.is_available() else "cpu"
+def load_peft_weights(model_id: str,
+                      device: Optional[str] = None,
+                      **hf_hub_download_kwargs) -> dict:
+    r"""
+    A helper method to load the PEFT weights from the HuggingFace Hub or locally
+
+    Local files are checked first (`adapter_model.safetensors`, then
+    `adapter_model.bin`). Only when neither exists under the local path is
+    the Hub queried, again preferring the safetensors file.
+
+    Args:
+        model_id (`str`):
+            The local path to the adapter weights or the name of the adapter to
+            load from the HuggingFace Hub.
+        device (`str`):
+            The device to load the weights onto. When `None`, the result of
+            `infer_device()` is used.
+        hf_hub_download_kwargs (`dict`):
+            Additional arguments to pass to the `hf_hub_download` method when
+            loading from the HuggingFace Hub. The keys `subfolder`, `token`,
+            `use_auth_token`, `revision` and `repo_type` are also read here.
+
+    Returns:
+        The adapter weights as returned by `safetensors.torch.load_file` or
+        `torch.load`.
+
+    Raises:
+        ValueError: If the Hub reports `adapter_model.bin` as missing after
+            no safetensors file was found either.
+    """
+    subfolder = hf_hub_download_kwargs.get("subfolder", None)
+    path = (os.path.join(model_id, subfolder)
+            if subfolder is not None else model_id)
+    if device is None:
+        device = infer_device()
+    if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)):
+        filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
+        use_safetensors = True
+    elif os.path.exists(os.path.join(path, WEIGHTS_NAME)):
+        filename = os.path.join(path, WEIGHTS_NAME)
+        use_safetensors = False
+    else:
+        # `use_auth_token` is only consulted when `token` is absent.
+        token = hf_hub_download_kwargs.get("token", None)
+        if token is None:
+            token = hf_hub_download_kwargs.get("use_auth_token", None)
+        hub_filename = (os.path.join(subfolder, SAFETENSORS_WEIGHTS_NAME)
+                        if subfolder is not None else
+                        SAFETENSORS_WEIGHTS_NAME)
+        has_remote_safetensors_file = file_exists(
+            repo_id=model_id,
+            filename=hub_filename,
+            revision=hf_hub_download_kwargs.get("revision", None),
+            repo_type=hf_hub_download_kwargs.get("repo_type", None),
+            token=token,
+        )
+        use_safetensors = has_remote_safetensors_file
+        if has_remote_safetensors_file:
+            # Priority 1: load safetensors weights
+            filename = hf_hub_download(
+                model_id,
+                SAFETENSORS_WEIGHTS_NAME,
+                **hf_hub_download_kwargs,
+            )
+        else:
+            try:
+                filename = hf_hub_download(model_id, WEIGHTS_NAME,
+                                           **hf_hub_download_kwargs)
+            except EntryNotFoundError as err:
+                # Adjacent string literals instead of backslash continuations,
+                # which embedded the source indentation in the message.
+                raise ValueError(
+                    f"Can't find weights for {model_id} in {model_id} or "
+                    "in the Hugging Face Hub. "
+                    f"Please check that the file {WEIGHTS_NAME} or "
+                    f"{SAFETENSORS_WEIGHTS_NAME} is present at {model_id}."
+                ) from err
+    if use_safetensors:
+        adapters_weights = safe_load_file(filename, device=device)
+    else:
+        # NOTE(review): torch.load unpickles the file, which may have been
+        # downloaded from the Hub; consider `weights_only=True` if every
+        # supported checkpoint loads that way.
+        adapters_weights = torch.load(filename,
+                                      map_location=torch.device(device))
+    return adapters_weights
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct,
    # is called.
    _new_appended_tokens: List[int] = msgspec.field(default_factory=list)
+    # It is used to compute mrope_position_ids.
+    _mrope_position_delta: Optional[int] = None
    def __post_init__(self) -> None:
        assert self._prompt_token_ids.typecode == "l"
        assert self._output_token_ids.typecode == "l"
@@ -219,6 +222,14 @@ class SequenceData(msgspec.Struct,
        assert isinstance(self._output_token_ids, array)
        return self._output_token_ids
+    @property
+    def mrope_position_delta(self) -> Optional[int]:
+        return self._mrope_position_delta
+    @mrope_position_delta.setter
+    def mrope_position_delta(self, new_mrope_position_delta):
+        self._mrope_position_delta = new_mrope_position_delta
    def append_token_id(self, token_id: int, logprob: float) -> None:
        self._output_token_ids.append(token_id)
        self._new_appended_tokens.append(token_id)

--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -2,7 +2,6 @@ from typing import List, Optional
 import torch
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.sampler import SamplerOutput
 try:
@@ -116,18 +115,9 @@ class TP1DraftModelRunner(ModelRunner):
        # Update attn_metadata
        attn_metadata = model_input.attn_metadata
        assert isinstance(attn_metadata, FlashAttentionMetadata)
-        attn_metadata.advance_step(num_seqs, num_queries)
+        attn_metadata.advance_step(model_input, sampled_token_ids,
-        # Update GPU tensors
+                                   self.block_size, num_seqs, num_queries)
-        ops.advance_step(num_seqs=num_seqs,
-                         num_queries=num_queries,
-                         block_size=self.block_size,
-                         input_tokens=model_input.input_tokens,
-                         sampled_token_ids=sampled_token_ids,
-                         input_positions=model_input.input_positions,
-                         seq_lens=attn_metadata.seq_lens_tensor,
-                         slot_mapping=attn_metadata.slot_mapping,
-                         block_tables=attn_metadata.block_tables)
        # Update sampling_metadata
        sampling_metadata = model_input.sampling_metadata

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
 import contextlib
+import enum
+import json
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union
+from huggingface_hub import file_exists, hf_hub_download
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import (
    get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                             EAGLEConfig, ExaoneConfig,
-                                             InternVLChatConfig, JAISConfig,
+                                             GraniteConfig, InternVLChatConfig,
-                                             MedusaConfig, MLPSpeculatorConfig,
+                                             JAISConfig, MedusaConfig,
-                                             MPTConfig, NemotronConfig,
+                                             MLPSpeculatorConfig, MPTConfig,
-                                             RWConfig, UltravoxConfig)
+                                             NemotronConfig, RWConfig,
+                                             UltravoxConfig)
+# yapf: enable
 from vllm.transformers_utils.utils import check_gguf_file
 if VLLM_USE_MODELSCOPE:
@@ -23,6 +31,8 @@ if VLLM_USE_MODELSCOPE:
 else:
    from transformers import AutoConfig
+MISTRAL_CONFIG_NAME = "params.json"
 logger = init_logger(__name__)
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
@@ -39,6 +49,9 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    "internvl_chat": InternVLChatConfig,
    "nemotron": NemotronConfig,
    "ultravox": UltravoxConfig,
+    # Granite can be removed from here once we have upgraded to
+    # transformers 4.45+
+    "granite": GraniteConfig,
 }
 for name, cls in _CONFIG_REGISTRY.items():
@@ -46,6 +59,20 @@ for name, cls in _CONFIG_REGISTRY.items():
        AutoConfig.register(name, cls)
+class ConfigFormat(str, enum.Enum):
+    """Names the config-file formats that ``get_config`` can load.
+
+    Members are also ``str`` instances, so they compare equal to their
+    plain-string values.
+    """
+    # Probe for a supported config file and pick the format from what exists.
+    AUTO = "auto"
+    # HuggingFace-style config (the file named by HF_CONFIG_NAME).
+    HF = "hf"
+    # Mistral-style config (MISTRAL_CONFIG_NAME, i.e. "params.json").
+    MISTRAL = "mistral"
+def file_or_path_exists(model: Union[str, Path], config_name, revision,
+                        token) -> bool:
+    """Check whether ``config_name`` exists for ``model``.
+
+    If ``model`` is an existing local path, only the local filesystem is
+    checked. Otherwise ``model`` is treated as a Hub repo id and the check is
+    delegated to ``huggingface_hub.file_exists``.
+    """
+    local_root = Path(model)
+    if not local_root.exists():
+        return file_exists(model, config_name, revision=revision, token=token)
+    return (local_root / config_name).is_file()
 def get_config(
    model: Union[str, Path],
    trust_remote_code: bool,
@@ -53,38 +80,68 @@ def get_config(
    code_revision: Optional[str] = None,
    rope_scaling: Optional[dict] = None,
    rope_theta: Optional[float] = None,
+    config_format: ConfigFormat = ConfigFormat.AUTO,
    **kwargs,
 ) -> PretrainedConfig:
    # Separate model folder from file path for GGUF models
    is_gguf = check_gguf_file(model)
    if is_gguf:
        kwargs["gguf_file"] = Path(model).name
        model = Path(model).parent
-    try:
+    if config_format == ConfigFormat.AUTO:
-        config = AutoConfig.from_pretrained(
+        if is_gguf or file_or_path_exists(model,
-            model,
+                                          HF_CONFIG_NAME,
-            trust_remote_code=trust_remote_code,
+                                          revision=revision,
-            revision=revision,
+                                          token=kwargs.get("token")):
-            code_revision=code_revision,
+            config_format = ConfigFormat.HF
-            **kwargs)
+        elif file_or_path_exists(model,
-    except ValueError as e:
+                                 MISTRAL_CONFIG_NAME,
-        if (not trust_remote_code and
+                                 revision=revision,
-                "requires you to execute the configuration file" in str(e)):
+                                 token=kwargs.get("token")):
-            err_msg = (
+            config_format = ConfigFormat.MISTRAL
-                "Failed to load the model config. If the model is a custom "
+        else:
-                "model not yet available in the HuggingFace transformers "
+            raise ValueError(f"No supported config format found in {model}")
-                "library, consider setting `trust_remote_code=True` in LLM "
-                "or using the `--trust-remote-code` flag in the CLI.")
+    if config_format == ConfigFormat.HF:
-            raise RuntimeError(err_msg) from e
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model, revision=revision, code_revision=code_revision, **kwargs)
+        # Use custom model class if it's in our registry
+        model_type = config_dict.get("model_type")
+        if model_type in _CONFIG_REGISTRY:
+            config_class = _CONFIG_REGISTRY[model_type]
+            config = config_class.from_pretrained(model,
+                                                  revision=revision,
+                                                  code_revision=code_revision)
        else:
-            raise e
+            try:
-    if config.model_type in _CONFIG_REGISTRY:
+                config = AutoConfig.from_pretrained(
-        config_class = _CONFIG_REGISTRY[config.model_type]
+                    model,
-        config = config_class.from_pretrained(model,
+                    trust_remote_code=trust_remote_code,
-                                              revision=revision,
+                    revision=revision,
-                                              code_revision=code_revision)
+                    code_revision=code_revision,
+                    **kwargs,
+                )
+            except ValueError as e:
+                if (not trust_remote_code
+                        and "requires you to execute the configuration file"
+                        in str(e)):
+                    err_msg = (
+                        "Failed to load the model config. If the model "
+                        "is a custom model not yet available in the "
+                        "HuggingFace transformers library, consider setting "
+                        "`trust_remote_code=True` in LLM or using the "
+                        "`--trust-remote-code` flag in the CLI.")
+                    raise RuntimeError(err_msg) from e
+                else:
+                    raise e
+    elif config_format == ConfigFormat.MISTRAL:
+        config = load_params_config(model, revision)
+    else:
+        raise ValueError(f"Unsupported config format: {config_format}")
    # Special architecture mapping check for GGUF models
    if is_gguf:
@@ -94,16 +151,81 @@ def get_config(
        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
        config.update({"architectures": [model_type]})
-    for key, value in [("rope_scaling", rope_scaling),
+    for key, value in [
-                       ("rope_theta", rope_theta)]:
+        ("rope_scaling", rope_scaling),
+        ("rope_theta", rope_theta),
+    ]:
        if value is not None:
-            logger.info("Updating %s from %r to %r", key,
+            logger.info(
-                        getattr(config, key, None), value)
+                "Updating %s from %r to %r",
+                key,
+                getattr(config, key, None),
+                value,
+            )
            config.update({key: value})
    return config
+def load_params_config(model, revision) -> PretrainedConfig:
+    """Load a Mistral-format ``params.json`` into a ``PretrainedConfig``.
+
+    The file is read from ``model`` when that is a local directory holding
+    it; otherwise it is fetched with ``hf_hub_download``. Mistral key names
+    are translated to their HuggingFace equivalents, and nested dicts become
+    nested ``PretrainedConfig`` objects.
+
+    Args:
+        model: Local directory or Hub repo id of the model.
+        revision: Revision passed to ``hf_hub_download`` when fetching.
+
+    Returns:
+        The resulting config. If ``params.json`` has a ``vision_encoder``
+        section, the result holds ``text_config`` and ``vision_config``
+        sub-configs and is tagged as ``pixtral``.
+    """
+    # This function loads a params.json config which
+    # should be used when loading models in mistral format
+    config_file_name = "params.json"
+    config_path = Path(model) / config_file_name
+    if not config_path.is_file():
+        config_path = Path(
+            hf_hub_download(model, config_file_name, revision=revision))
+    # Read as UTF-8 explicitly so parsing does not depend on the locale.
+    with open(config_path, "r", encoding="utf-8") as file:
+        config_dict = json.load(file)
+    # Mistral key name -> HuggingFace key name
+    config_mapping = {
+        "dim": "hidden_size",
+        "norm_eps": "rms_norm_eps",
+        "n_kv_heads": "num_key_value_heads",
+        "n_layers": "num_hidden_layers",
+        "n_heads": "num_attention_heads",
+        "hidden_dim": "intermediate_size",
+    }
+    def recurse_elems(elem: Any):
+        # Dicts become PretrainedConfig objects with renamed keys, handled
+        # recursively; any other value is returned untouched.
+        if isinstance(elem, dict):
+            renamed = {}
+            for key, value in elem.items():
+                key = config_mapping.get(key, key)
+                renamed[key] = recurse_elems(value)
+            return PretrainedConfig(**renamed)
+        else:
+            return elem
+    # Fill in defaults for fields the params file may omit.
+    config_dict["model_type"] = config_dict.get("model_type", "transformer")
+    config_dict["hidden_act"] = config_dict.get("activation", "silu")
+    config_dict["tie_word_embeddings"] = config_dict.get(
+        "tie_embeddings", False)
+    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
+    if config_dict.get("moe") is not None:
+        config_dict["architectures"] = ["MixtralForCausalLM"]
+    else:
+        config_dict["architectures"] = ["MistralForCausalLM"]
+    if config_dict.get("vision_encoder") is not None:
+        # Split into a text part (everything else) and a vision part.
+        multimodal_config = config_dict.pop("vision_encoder")
+        config_dict = {
+            "text_config": config_dict,
+            "vision_config": multimodal_config
+        }
+        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
+        config_dict["model_type"] = "pixtral"
+    config = recurse_elems(config_dict)
+    return config
 def get_hf_image_processor_config(
    model: Union[str, Path],
    revision: Optional[str] = None,
@@ -120,7 +242,7 @@ def get_hf_image_processor_config(
 def get_hf_text_config(config: PretrainedConfig):
    """Get the "sub" config relevant to llm for multi modal models.
-        No op for pure text models.
+    No op for pure text models.
    """
    if hasattr(config, "text_config"):
        # The code operates under the assumption that text_config should have

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -6,6 +6,7 @@ from vllm.transformers_utils.configs.exaone import ExaoneConfig
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.granite import GraniteConfig
 from vllm.transformers_utils.configs.internvl import InternVLChatConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
@@ -27,4 +28,7 @@ __all__ = [
    "MLPSpeculatorConfig",
    "NemotronConfig",
    "UltravoxConfig",
+    # Granite can be removed from here once we have upgraded to
+    # transformers 4.45+
+    "GraniteConfig",
 ]
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
 from typing import cast
+def get_video_processor(
+    processor_name: str,
+    trust_remote_code: bool = False,
+):
+    """
+    Gets a video processor for the given model name via HuggingFace.
+
+    Args:
+        processor_name: Model name or path passed to
+            ``AutoProcessor.from_pretrained``.
+        trust_remote_code: Forwarded to ``AutoProcessor.from_pretrained``.
+
+    Returns:
+        The ``video_processor`` attribute of the loaded processor.
+
+    Raises:
+        RuntimeError: If loading raises ``ValueError`` while
+            ``trust_remote_code`` is false; the message suggests the flag.
+        ValueError: Re-raised unchanged when ``trust_remote_code`` is true.
+    """
+    from transformers import AutoProcessor
+    try:
+        # The flag must actually be forwarded, otherwise the hint in the
+        # error message below could never help.
+        processor = AutoProcessor.from_pretrained(
+            processor_name, trust_remote_code=trust_remote_code)
+    except ValueError as e:
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    return processor.video_processor
 def get_image_processor(
    processor_name: str,
    *args,

--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
+from typing import cast
+def get_processor(
+    processor_name: str,
+    *args,
+    trust_remote_code: bool = False,
+    **kwargs,
+):
+    """Load a HuggingFace processor for ``processor_name``.
+
+    Extra positional and keyword arguments are forwarded to
+    ``AutoProcessor.from_pretrained``. A ``ValueError`` raised while loading
+    is turned into a ``RuntimeError`` suggesting ``trust_remote_code`` when
+    that flag is off, and re-raised unchanged when it is on.
+    """
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoProcessor
+    from transformers.processing_utils import ProcessorMixin
+    try:
+        loaded = AutoProcessor.from_pretrained(
+            processor_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # AutoProcessor, unlike AutoTokenizer, does not raise a distinct
+        # error for a processor class that is missing or not imported, so
+        # any ValueError gets the --trust-remote-code hint.
+        if trust_remote_code:
+            raise e
+        err_msg = (
+            "Failed to load the processor. If the processor is "
+            "a custom processor not yet available in the HuggingFace "
+            "transformers library, consider setting "
+            "`trust_remote_code=True` in LLM or using the "
+            "`--trust-remote-code` flag in the CLI.")
+        raise RuntimeError(err_msg) from e
+    return cast(ProcessorMixin, loaded)
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,7 +16,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
                                                     Tekkenizer)
 if TYPE_CHECKING:
-    from vllm.entrypoints.chat_utils import ConversationMessage
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 @dataclass
@@ -45,26 +45,25 @@ class MistralTokenizer:
    def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
        self.mistral = tokenizer
        self.instruct = tokenizer.instruct_tokenizer
-        self.tokenizer = tokenizer.instruct_tokenizer.tokenizer
-        self.vocab_size = len(self.tokenizer.vocab())
+        tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
+        if isinstance(tokenizer_, Tekkenizer):
-        assert isinstance(self.tokenizer,
-                          (Tekkenizer, SentencePieceTokenizer)), type(
-                              self.tokenizer)
-        if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
            # Make sure special tokens will not raise
-            self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE
+            tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
-        self._is_tekken = is_tekken
+            self._vocab = {
+                token: idx
+                for idx, token in enumerate(tokenizer_.vocab())
+            }
+        elif isinstance(tokenizer_, SentencePieceTokenizer):
+            self._vocab = {
+                token: idx
+                for idx, token in enumerate(tokenizer_.vocab())
+            }
+        else:
+            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
-        # the following attributes are set to fit VLLM's design
+        self.tokenizer = tokenizer_
-        self.is_fast = True
-        self.chat_template = True
-        self.all_special_ids: List[Any] = []
-        self.all_special_tokens: List[Any] = []
-        self.all_special_tokens_extended: List[Any] = []
    @classmethod
    def from_pretrained(cls,
@@ -102,6 +101,38 @@ class MistralTokenizer:
                                         revision=revision)
        return tokenizer_file
+    # the following attributes are set to fit VLLM's design
+    @property
+    def all_special_tokens_extended(self) -> List[str]:
+        return []
+    @property
+    def all_special_tokens(self) -> List[str]:
+        return []
+    @property
+    def all_special_ids(self) -> List[int]:
+        return []
+    @property
+    def bos_token_id(self) -> int:
+        return self.tokenizer.bos_id
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_id
+    @property
+    def is_fast(self) -> bool:
+        return True
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab)
+    def __len__(self) -> int:
+        return self.vocab_size
    def __call__(
        self,
        prompt: str,
@@ -117,31 +148,34 @@ class MistralTokenizer:
        return Encoding(input_ids=input_ids)
-    def get_added_vocab(self) -> List[str]:
+    def get_vocab(self) -> Dict[str, int]:
+        return self._vocab
+    def get_added_vocab(self) -> Dict[str, int]:
        # Mistral tokenizers have no added vocabulary
-        return []
+        return {}
    def encode(self, prompt: str) -> List[int]:
-        # `encode ` should only be used for prompt completion
+        # `encode` should only be used for prompt completion
        # it should never be used for chat_completion.
        # For chat completion use `apply_chat_template`
        return self.tokenizer.encode(prompt, bos=True, eos=False)
    def apply_chat_template(self,
-                            conversation: List["ConversationMessage"],
+                            messages: List["ChatCompletionMessageParam"],
                            tools: Optional[Dict[str, Any]] = None,
                            **kwargs) -> List[int]:
        assert tools is None, "`tools` are not yet supported."
        request = ChatCompletionRequest(
-            messages=conversation)  # type: ignore[type-var]
+            messages=messages)  # type: ignore[type-var]
        encoded = self.mistral.encode_chat_completion(request)
        # encode-decode to get clean prompt
        return encoded.tokens
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        if self._is_tekken:
+        if isinstance(self.tokenizer, Tekkenizer):
            return "".join(tokens)
        else:
            return self.tokenizer.decode(tokens)  # type: ignore[arg-type]
@@ -151,14 +185,11 @@ class MistralTokenizer:
            ids = [ids]
        return self.tokenizer.decode(ids)
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_id
    def convert_ids_to_tokens(
-            self,
+        self,
-            ids: List[int],
+        ids: List[int],
-            skip_special_tokens: Optional[bool] = True) -> List[str]:
+        skip_special_tokens: bool = True,
+    ) -> List[str]:
        # TODO(Patrick) - potentially allow special tokens to not be skipped
        assert (
            skip_special_tokens
@@ -170,6 +201,3 @@ class MistralTokenizer:
        tokens = [self.tokenizer.id_to_piece(id) for id in ids]
        return tokens
-    def __len__(self):
-        return self.vocab_size
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1224,3 +1224,28 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
 def supports_dynamo() -> bool:
    base_torch_version = Version(Version(torch.__version__).base_version)
    return base_torch_version >= Version("2.4.0")
+class AtomicCounter:
+    """An atomic, thread-safe counter"""
+    def __init__(self, initial=0):
+        """Initialize a new atomic counter to given initial value"""
+        self._value = initial
+        self._lock = threading.Lock()
+    def inc(self, num=1):
+        """Atomically increment the counter by num and return the new value"""
+        with self._lock:
+            self._value += num
+            return self._value
+    def dec(self, num=1):
+        """Atomically decrement the counter by num and return the new value"""
+        with self._lock:
+            self._value -= num
+            return self._value
+    @property
+    def value(self):
+        return self._value
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -9,4 +9,4 @@ except Exception as e:
                  stacklevel=2)
    __commit__ = "COMMIT_HASH_PLACEHOLDER"
-__version__ = "0.6.0"
+__version__ = "0.6.1"
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -207,7 +207,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
    def init_device(self) -> None:
        if self.local_omp_cpuid != "all":
-            torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid)
+            logger.info(ret)
        self.init_distributed_environment()
        # Set random seed.

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -30,6 +30,7 @@ from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata, SamplingMetadataCache
+from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -74,6 +75,10 @@ _NUM_WARMUP_ITERS = 2
 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
+# For now, bump up cache limits for recompilations during CUDA graph warmups.
+torch._dynamo.config.cache_size_limit = 128
+torch._dynamo.config.accumulated_cache_size_limit = 128
 @dataclass(frozen=True)
 class ModelInputForGPU(ModelRunnerInputBase):
@@ -181,6 +186,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
        def simple_reinit(self):
            self.input_tokens[0].clear()  # type: ignore
            self.input_positions[0].clear()  # type: ignore
+            self.mrope_input_positions = None  # type: ignore
            self.seq_lens[0] = 0  # type: ignore
            self.orig_seq_lens[0] = 0  # type: ignore
            self.query_lens[0] = 0  # type: ignore
@@ -206,6 +212,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            # Input tokens and positions.
            input_tokens: Optional[List[List[int]]] = None,
            input_positions: Optional[List[List[int]]] = None,
+            mrope_input_positions: Optional[List[List[List[int]]]] = None,
            # The sequence length (may be capped to the sliding window).
            seq_lens: Optional[List[int]] = None,
@@ -266,6 +273,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                        for seq_id in range(len(self.seq_ids)):
                            self.input_positions[seq_id].clear()
+                    self.mrope_input_positions = None
                    if seq_lens:
                        self.seq_lens = seq_lens
                    else:
@@ -327,6 +336,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            else:
                self.input_tokens = input_tokens or []
                self.input_positions = input_positions or []
+                self.mrope_input_positions = mrope_input_positions or None
                self.seq_lens = seq_lens or []
                self.orig_seq_lens = orig_seq_lens or []
                self.query_lens = query_lens or []
@@ -357,6 +367,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            self.input_tokens = [[] for _ in range(self.n_seqs)]
            self.input_positions = [[] for _ in range(self.n_seqs)]
+            self.mrope_input_positions = None
            self.seq_lens = [0] * self.n_seqs
            self.orig_seq_lens = [0] * self.n_seqs
            self.query_lens = [0] * self.n_seqs
@@ -493,6 +504,17 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
        inter_data.query_lens[
            seq_idx] = seq_len - context_len if inter_data.is_prompt else 1
+        if seq_data.mrope_position_delta is not None:
+            if inter_data.mrope_input_positions is None:
+                inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+            inter_data.mrope_input_positions[
+                seq_idx] = MRotaryEmbedding.get_next_input_positions(
+                    seq_data.mrope_position_delta,
+                    context_len,
+                    seq_len,
+                )
    def _compute_for_prefix_cache_hit(
            self, inter_data: InterDataForSeqGroup, seq_idx: int,
            seq_group_metadata: SequenceGroupMetadata):
@@ -636,6 +658,40 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
        mm_kwargs = self.multi_modal_input_mapper(mm_data)
        inter_data.multi_modal_inputs = mm_kwargs
+        # special processing for mrope position deltas.
+        if self.runner.model_is_mrope:
+            image_grid_thw = mm_kwargs.get("image_grid_thw", None)
+            video_grid_thw = mm_kwargs.get("video_grid_thw", None)
+            assert image_grid_thw is not None or video_grid_thw is not None, (
+                "mrope embedding type requires multi-modal input mapper "
+                "returns 'image_grid_thw' or 'video_grid_thw'.")
+            hf_config = self.runner.model_config.hf_config
+            inter_data.mrope_input_positions = [None] * inter_data.n_seqs
+            for seq_idx in range(inter_data.n_seqs):
+                seq_data = seq_group_metadata.seq_data[
+                    inter_data.seq_ids[seq_idx]]
+                token_ids = seq_data.get_token_ids()
+                mrope_input_positions, mrope_position_delta = \
+                    MRotaryEmbedding.get_input_positions(
+                        token_ids,
+                        image_grid_thw=image_grid_thw,
+                        video_grid_thw=video_grid_thw,
+                        image_token_id=hf_config.image_token_id,
+                        video_token_id=hf_config.video_token_id,
+                        vision_start_token_id=hf_config.vision_start_token_id,
+                        vision_end_token_id=hf_config.vision_end_token_id,
+                        spatial_merge_size=hf_config.vision_config.
+                        spatial_merge_size,
+                        context_len=inter_data.context_lens[seq_idx],
+                    )
+                seq_data.mrope_position_delta = mrope_position_delta
+                inter_data.mrope_input_positions[
+                    seq_idx] = mrope_input_positions
    def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
        """Add a sequence group to the builder."""
        seq_ids = seq_group_metadata.seq_data.keys()
@@ -684,10 +740,27 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            # prefix caching and there is no decode request.
            return self.model_input_cls()
-        input_positions = []
+        mrope_input_positions: Optional[List[List[int]]] = None
-        for inter_data in self.inter_data_list:
+        if any(inter_data.mrope_input_positions is not None
-            for cur_input_positions in inter_data.input_positions:
+               for inter_data in self.inter_data_list):
-                input_positions.extend(cur_input_positions)
+            mrope_input_positions = [[] for _ in range(3)]
+            for idx in range(3):
+                for inter_data in self.inter_data_list:
+                    msections = inter_data.mrope_input_positions
+                    if msections is None:
+                        for _seq_input_positions in inter_data.input_positions:
+                            mrope_input_positions[idx].extend(
+                                _seq_input_positions)
+                    else:
+                        for _seq_mrope_input_positions in msections:
+                            mrope_input_positions[idx].extend(
+                                _seq_mrope_input_positions[idx])
+            input_positions = None
+        else:
+            input_positions = []
+            for inter_data in self.inter_data_list:
+                for cur_input_positions in inter_data.input_positions:
+                    input_positions.extend(cur_input_positions)
        seq_lens = []
        max_decode_seq_len = 0
@@ -724,15 +797,24 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
        # Tokens and positions.
        if cuda_graph_pad_size:
            input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size))
-            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
        assert self.runner.device is not None
        input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long,
                                               self.runner.device,
                                               self.runner.pin_memory)
-        input_positions_tensor = async_tensor_h2d(input_positions, torch.long,
+        if mrope_input_positions is not None:
-                                                  self.runner.device,
+            for idx in range(3):
-                                                  self.runner.pin_memory)
+                mrope_input_positions[idx].extend(
+                    itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(mrope_input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
+        else:
+            input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
+            input_positions_tensor = async_tensor_h2d(input_positions,
+                                                      torch.long,
+                                                      self.runner.device,
+                                                      self.runner.pin_memory)
        # Sequence and query lengths.
        if cuda_graph_pad_size:
            seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size))
@@ -982,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                    "This may lead to less accurate results!")
        if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
-            self.model = torch.compile(self.model,
+            self.model = torch.compile(
-                                       fullgraph=True,
+                self.model,
-                                       backend="eager")
+                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                backend="eager")
    def save_sharded_state(
        self,
@@ -1226,6 +1309,15 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
            raise RuntimeError("PromptAdapter is not enabled.")
        return self.prompt_adapter_manager.list_adapters()
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keeping "rope_deltas" across prompt and decode phases."""
+        rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        return rope_scaling.get("type", None) == "mrope"
    @torch.inference_mode()
    def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
        """Cuda graph capture a model.
@@ -1256,7 +1348,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
        max_batch_size = self.max_batchsize_to_capture
        input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
        input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
+        if self.model_is_mrope:
+            input_positions = torch.tile(input_positions, (3, 1))
        # Prepare dummy previous_hidden_states only if needed by the model.
        # This is used by draft models such as EAGLE.
        previous_hidden_states = None
@@ -1320,7 +1413,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                        "input_ids":
                        input_tokens[:batch_size],
                        "positions":
-                        input_positions[:batch_size],
+                        input_positions[..., :batch_size],
                        "hidden_or_intermediate_states":
                        hidden_or_intermediate_states[
                            virtual_engine]  # type: ignore