Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
3 changed files
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -3,8 +3,8 @@ from typing import Optional
 from torch import nn

 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VisionLanguageConfig)
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.model_executor.model_loader.loader import (BaseModelLoader,
                                                     get_model_loader)
 from vllm.model_executor.model_loader.utils import (
@@ -15,13 +15,13 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
              device_config: DeviceConfig, parallel_config: ParallelConfig,
              scheduler_config: SchedulerConfig,
              lora_config: Optional[LoRAConfig],
-              vision_language_config: Optional[VisionLanguageConfig],
+              multimodal_config: Optional[MultiModalConfig],
              cache_config: CacheConfig) -> nn.Module:
    loader = get_model_loader(load_config)
    return loader.load_model(model_config=model_config,
                             device_config=device_config,
                             lora_config=lora_config,
-                             vision_language_config=vision_language_config,
+                             multimodal_config=multimodal_config,
                             parallel_config=parallel_config,
                             scheduler_config=scheduler_config,
                             cache_config=cache_config)

--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -16,15 +16,15 @@ from huggingface_hub import HfApi, hf_hub_download
 from torch import nn

 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
-                         LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VisionLanguageConfig)
+                         LoRAConfig, ModelConfig, MultiModalConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
-    tensorizer_weights_iterator)
+    serialize_vllm_model, tensorizer_weights_iterator)
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                    set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -32,8 +32,11 @@ from vllm.model_executor.model_loader.weight_utils import (
    filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
    get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
    pt_weights_iterator, safetensors_weights_iterator)
-from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
+from vllm.model_executor.models.interfaces import (supports_lora,
+                                                   supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.utils import is_tpu

 logger = init_logger(__name__)

@@ -44,7 +47,7 @@ def _get_quantization_config(
    """Get the quantization config."""
    if model_config.quantization is not None:
        quant_config = get_quant_config(model_config, load_config)
-        capability = torch.cuda.get_device_capability()
+        capability = current_platform.get_device_capability()
        capability = capability[0] * 10 + capability[1]
        if capability < quant_config.get_min_capability():
            raise ValueError(
@@ -66,12 +69,15 @@ def _get_quantization_config(


 def _get_model_initialization_kwargs(
-        model_class: Type[nn.Module], lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig]
+    model_class: Type[nn.Module],
+    lora_config: Optional[LoRAConfig],
+    multimodal_config: Optional[MultiModalConfig],
 ) -> Dict[str, Any]:
    """Get extra kwargs for model initialization."""
-    extra_kwargs = {}
-    if hasattr(model_class, "supported_lora_modules"):
+    extra_kwargs: Dict[str, Any] = {}
+
+    if supports_lora(model_class):
+        # lora_config=None is used to disable LoRA
        extra_kwargs["lora_config"] = lora_config
    elif lora_config:
        raise ValueError(
@@ -79,19 +85,20 @@ def _get_model_initialization_kwargs(
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on github.")
-    elif issubclass(model_class, VisionLanguageModelBase):
-        if vision_language_config is None:
-            raise ValueError("Provide `image_input_type` and other vision "
-                             "related configurations through LLM entrypoint "
-                             "or engine arguments.")

-        extra_kwargs["vision_language_config"] = vision_language_config
+    if supports_vision(model_class):
+        if multimodal_config is None:
+            raise ValueError("Provide vision related configurations "
+                             "through LLM entrypoint or engine arguments.")
+
+        extra_kwargs["multimodal_config"] = multimodal_config
+
    return extra_kwargs


 def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
                      lora_config: Optional[LoRAConfig],
-                      vision_language_config: Optional[VisionLanguageConfig],
+                      multimodal_config: Optional[MultiModalConfig],
                      cache_config: CacheConfig) -> nn.Module:
    """Initialize a model with the given configurations."""
    model_class = get_model_architecture(model_config)[0]
@@ -101,7 +108,7 @@ def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
                       cache_config=cache_config,
                       quant_config=quant_config,
                       **_get_model_initialization_kwargs(
-                           model_class, lora_config, vision_language_config))
+                           model_class, lora_config, multimodal_config))


 class BaseModelLoader(ABC):
@@ -114,7 +121,7 @@ class BaseModelLoader(ABC):
    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
@@ -230,24 +237,38 @@ class DefaultModelLoader(BaseModelLoader):
        if self.load_config.load_format == LoadFormat.NPCACHE:
            # Currently np_cache only support *.bin checkpoints
            assert use_safetensors is False
-            return np_cache_weights_iterator(model_name_or_path,
-                                             self.load_config.download_dir,
-                                             hf_folder, hf_weights_files)
-        if use_safetensors:
-            return safetensors_weights_iterator(hf_weights_files)
-        return pt_weights_iterator(hf_weights_files)
+            weights_iterator = np_cache_weights_iterator(
+                model_name_or_path, self.load_config.download_dir, hf_folder,
+                hf_weights_files)
+        elif use_safetensors:
+            weights_iterator = safetensors_weights_iterator(hf_weights_files)
+        else:
+            weights_iterator = pt_weights_iterator(hf_weights_files)
+
+        if is_tpu():
+            # In PyTorch XLA, we should call `xm.mark_step` frequently so that
+            # not too many ops are accumulated in the XLA program.
+            import torch_xla.core.xla_model as xm
+
+            def _xla_weights_iterator(iterator: Generator):
+                for weights in iterator:
+                    yield weights
+                    xm.mark_step()
+
+            weights_iterator = _xla_weights_iterator(weights_iterator)
+        return weights_iterator

    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                          cache_config)
            model.load_weights(
                self._get_weights_iterator(model_config.model,
@@ -280,14 +301,14 @@ class DummyModelLoader(BaseModelLoader):
    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                          cache_config)
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
@@ -321,7 +342,7 @@ class TensorizerLoader(BaseModelLoader):
        model_config: ModelConfig,
        device_config: DeviceConfig,
        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
        cache_config: CacheConfig,
    ) -> nn.Module:
        """Load a serialized model with tensorizer to the CPU.
@@ -334,7 +355,7 @@ class TensorizerLoader(BaseModelLoader):
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                          cache_config)

            model.load_weights(self._get_weights_iterator())
@@ -345,7 +366,7 @@ class TensorizerLoader(BaseModelLoader):
        model_config: ModelConfig,
        device_config: DeviceConfig,
        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
        cache_config: CacheConfig,
    ) -> nn.Module:
        """Load a serialized model with tensorizer.
@@ -359,7 +380,7 @@ class TensorizerLoader(BaseModelLoader):
                quant_config = _get_quantization_config(
                    model_config, self.load_config)
                extra_kwargs = _get_model_initialization_kwargs(
-                    model_class, lora_config, vision_language_config)
+                    model_class, lora_config, multimodal_config)
                extra_kwargs["quant_config"] = quant_config
                extra_kwargs["cache_config"] = cache_config

@@ -374,22 +395,36 @@ class TensorizerLoader(BaseModelLoader):
    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        self._verify_config(model_config, parallel_config)

+        if parallel_config.tensor_parallel_size > 1:
+            from vllm.distributed import get_tensor_model_parallel_rank
+            self.tensorizer_config.tensorizer_uri = \
+                self.tensorizer_config.tensorizer_uri \
+                    % get_tensor_model_parallel_rank()
+
        if is_vllm_tensorized(self.tensorizer_config):
            return self._load_model_serialized(model_config, device_config,
-                                               lora_config,
-                                               vision_language_config,
+                                               lora_config, multimodal_config,
                                               cache_config)
        return self._load_model_serialized_cpu(model_config, device_config,
-                                               lora_config,
-                                               vision_language_config,
+                                               lora_config, multimodal_config,
                                               cache_config)

+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        serialize_vllm_model(
+            model=model,
+            tensorizer_config=tensorizer_config,
+        )
+

 class ShardedStateLoader(BaseModelLoader):
    """
@@ -418,7 +453,8 @@ class ShardedStateLoader(BaseModelLoader):
        Filter out all tensors that share the same memory or a subset of the
        memory of another tensor.
        """
-        same_storage_groups = collections.defaultdict(list)
+        same_storage_groups: Dict[Any, List[Tuple[
+            str, torch.Tensor]]] = collections.defaultdict(list)
        for key, tensor in tensors.items():
            if tensor.numel():
                ptr = tensor.untyped_storage().data_ptr()
@@ -427,7 +463,7 @@ class ShardedStateLoader(BaseModelLoader):
        def get_end_ptr(tensor: torch.Tensor) -> int:
            return tensor.view(-1)[-1].data_ptr() + tensor.element_size()

-        result = {}
+        result: Dict[str, torch.Tensor] = {}
        for group in same_storage_groups.values():
            for k, t in group:
                a, b = t.data_ptr(), get_end_ptr(t)
@@ -459,7 +495,7 @@ class ShardedStateLoader(BaseModelLoader):
    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
@@ -473,7 +509,7 @@ class ShardedStateLoader(BaseModelLoader):
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                          cache_config)
            rank = get_tensor_model_parallel_rank()
            pattern = os.path.join(
@@ -769,14 +805,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
-                   vision_language_config: Optional[VisionLanguageConfig],
+                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config,
-                                          lora_config, vision_language_config,
+                                          lora_config, multimodal_config,
                                          cache_config)

                self._load_weights(model_config, model)

--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
+# ruff: noqa: SIM117
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import openvino as ov
+import torch
+from huggingface_hub import HfApi
+from openvino._offline_transformations import paged_attention_transformation
+from optimum.intel import OVModelForCausalLM
+from torch import nn
+
+import vllm.envs as envs
+from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
+from vllm.config import DeviceConfig, ModelConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
+                                                         _prune_hidden_states)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+
+logger = init_logger(__name__)
+
+
+def _flattenize_inputs(inputs):
+    """
+    Helper function for making nested inputs flattens
+    """
+    flatten_inputs = []
+    for input_data in inputs:
+        if input_data is None:
+            continue
+        if isinstance(input_data, (list, tuple)):
+            flatten_inputs.extend(_flattenize_inputs(input_data))
+        elif isinstance(input_data, dict):
+            flatten_inputs.extend(_flattenize_inputs(list(
+                input_data.values())))
+        else:
+            flatten_inputs.append(input_data)
+    return flatten_inputs
+
+
+def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
+                             is_cpu: bool):
+    # Apply hardware dependent modifications to KV tensors
+    for parameter in model.get_parameters():
+        input = parameter.get_output_tensor(0)
+        input_names = input.get_names()
+        if len(input_names) != 1:
+            continue
+        input_name = next(iter(input_names))
+        shape = parameter.get_partial_shape()
+        # use real block size if available, just a placeholder
+        # to provide the expected rank
+        x_size = 1
+        num_blocks = ov.Dimension()
+        block_size = ov.Dimension()
+        head_size = ov.Dimension()
+        # TODO: Negotiate required layout with plugins (CPU is ~OK, GPU is TBD),
+        # pass more parameters to this function to set more static dimensions
+        if input_name.startswith("key_cache."):
+            cpu_shape = [num_blocks, shape[1], block_size, head_size]
+            gpu_shape = [
+                num_blocks,
+                shape[1],
+                shape[2].get_length() //
+                x_size if shape[2].is_static else ov.Dimension(),
+                block_size,
+                x_size,
+            ]
+        elif input_name.startswith("value_cache."):
+            cpu_shape = [num_blocks, shape[1], block_size, head_size]
+            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
+        else:
+            continue
+        parameter.set_partial_shape(
+            ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
+        parameter.set_element_type(kv_cache_dtype)
+    model.validate_nodes_and_infer_types()
+
+
+def _require_model_export(model_id, revision=None, subfolder=None):
+    model_dir = Path(model_id)
+    if subfolder is not None:
+        model_dir = model_dir / subfolder
+    if model_dir.is_dir():
+        return (not (model_dir / "openvino_model.xml").exists()
+                or not (model_dir / "openvino_model.bin").exists())
+
+    hf_api = HfApi()
+    try:
+        model_info = hf_api.model_info(model_id, revision=revision or "main")
+        normalized_subfolder = (None if subfolder is None else
+                                Path(subfolder).as_posix())
+        model_files = [
+            file.rfilename for file in model_info.siblings
+            if normalized_subfolder is None
+            or file.rfilename.startswith(normalized_subfolder)
+        ]
+        ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
+                         else f"{normalized_subfolder}/openvino_model.xml")
+        return (ov_model_path not in model_files
+                or ov_model_path.replace(".xml", ".bin") not in model_files)
+    except Exception:
+        return True
+
+
+class OpenVINOCasualLM(nn.Module):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        kv_cache_dtype: ov.Type,
+    ) -> None:
+        super().__init__()
+        self.logits_processor = LogitsProcessor(
+            model_config.hf_config.vocab_size, logits_as_input=True)
+        self.sampler = Sampler()
+
+        export = _require_model_export(model_config.model)
+        if export:
+            logger.warning(
+                f"Provided model id {model_config.model} does not "  # noqa: G004
+                "contain OpenVINO IR, the model will be converted to IR with "
+                "default options. If you need to use specific options for "
+                "model conversion, use optimum-cli export openvino with "
+                "desired options.")
+        else:
+            logger.warning(
+                "OpenVINO IR is available for provided model id "  # noqa: G004
+                f"{model_config.model}. This IR will be used for inference "
+                "as-is, all possible options that may affect model conversion "
+                "are ignored.")
+
+        load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
+        pt_model = OVModelForCausalLM.from_pretrained(
+            model_config.model,
+            export=export,
+            compile=False,
+            load_in_8bit=load_in_8bit,
+            trust_remote_code=model_config.trust_remote_code,
+        )
+
+        paged_attention_transformation(pt_model.model)
+        _modify_cache_parameters(pt_model.model, kv_cache_dtype,
+                                 device_config.device.type == "cpu")
+
+        core = ov.Core()
+        ov_compiled = core.compile_model(pt_model.model, "CPU")
+        self.ov_request = ov_compiled.create_infer_request()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
+        attn_metadata: OpenVINOAttentionMetadata,
+    ) -> torch.Tensor:
+        flatten_kv_cache = _flattenize_inputs(kv_caches)
+
+        inputs = [
+            input_ids,
+            positions,
+            *flatten_kv_cache,
+            attn_metadata.past_lens,
+            attn_metadata.subsequence_begins,
+            attn_metadata.block_indices,
+            attn_metadata.block_indices_begins,
+            attn_metadata.max_context_len,
+        ]
+
+        self.ov_request.start_async(inputs, share_inputs=True)
+        self.ov_request.wait()
+
+        logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)
+
+        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
+        return logits.view(-1, logits.shape[-1])
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
+        logits = self.logits_processor(None, hidden_states, sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+
+def get_model(
+    model_config: ModelConfig,
+    device_config: DeviceConfig,
+    kv_cache_dtype: ov.Type,
+    **kwargs,
+) -> torch.nn.Module:
+    lora_config = kwargs.get("lora_config", None)
+    if lora_config:
+        raise ValueError(
+            "OpenVINO modeling does not support LoRA, "
+            "but LoRA is enabled. Support for this model may "
+            "be added in the future. If this is important to you, "
+            "please open an issue on github.")
+
+    return OpenVINOCasualLM(model_config, device_config, kv_cache_dtype)