# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: SIM117
from pathlib import Path
5
from typing import Optional
6
7
8
9
10
11
12
13
14

import openvino as ov
import torch
from huggingface_hub import HfApi
from openvino._offline_transformations import paged_attention_transformation
from optimum.intel import OVModelForCausalLM
from torch import nn

import vllm.envs as envs
15
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
16
from vllm.forward_context import get_forward_context
17
18
19
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
                                                         _prune_hidden_states)
20
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
21
from vllm.model_executor.sampling_metadata import SamplingMetadata
22
from vllm.platforms import current_platform
23
24
25
26

logger = init_logger(__name__)


27
def _flatten_inputs(inputs):
    """
    Flatten arbitrarily nested inputs into a single flat list.

    Lists and tuples are expanded recursively, dictionaries contribute their
    values (in iteration order, also expanded recursively) and ``None``
    entries are dropped. Every other object is appended unchanged.
    """
    flatten_inputs = []
    for input_data in inputs:
        if input_data is None:
            continue
        if isinstance(input_data, (list, tuple)):
            flatten_inputs.extend(_flatten_inputs(input_data))
        elif isinstance(input_data, dict):
            # Only the values are kept; the keys are discarded.
            flatten_inputs.extend(_flatten_inputs(input_data.values()))
        else:
            flatten_inputs.append(input_data)
    return flatten_inputs


def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
                             is_cpu: bool):
    """Reshape and retype the KV-cache parameters of ``model`` in place.

    Every model parameter whose output tensor carries exactly one name
    starting with ``key_cache.`` or ``value_cache.`` receives a new rank-4
    partial shape (CPU or GPU layout, selected by ``is_cpu``) and the
    element type ``kv_cache_dtype``. All other parameters are left
    untouched. The model is re-validated once after the loop.

    Args:
        model: Model whose parameters are modified.
        kv_cache_dtype: Element type assigned to the cache parameters.
        is_cpu: Pick the CPU shape layout when true, the GPU one otherwise.
    """
    # Apply hardware dependent modifications to KV tensors
    for parameter in model.get_parameters():
        tensor_names = parameter.get_output_tensor(0).get_names()
        if len(tensor_names) != 1:
            continue
        input_name = next(iter(tensor_names))
        shape = parameter.get_partial_shape()
        # Default-constructed dimensions are placeholders that only provide
        # the expected rank. TODO: use the real block size if available.
        num_blocks = ov.Dimension()
        block_size = ov.Dimension()
        head_size = ov.Dimension()
        if input_name.startswith("key_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
        elif input_name.startswith("value_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], block_size, shape[2]]
        else:
            # Not a KV-cache input.
            continue
        parameter.set_partial_shape(
            ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
        parameter.set_element_type(kv_cache_dtype)
    model.validate_nodes_and_infer_types()


def _require_model_export(model_id, revision=None, subfolder=None):
    """Report whether the OpenVINO IR file pair is missing for ``model_id``.

    If ``model_id`` (joined with ``subfolder`` when given) is an existing
    local directory, the result is ``True`` unless both
    ``openvino_model.xml`` and ``openvino_model.bin`` exist in it.

    Otherwise the file listing returned by ``HfApi.model_info`` (for
    ``revision``, falling back to ``"main"``) is searched for the same two
    files. Any exception raised during that lookup yields ``True``.
    """
    local_dir = Path(model_id)
    if subfolder is not None:
        local_dir = local_dir / subfolder
    if local_dir.is_dir():
        has_xml = (local_dir / "openvino_model.xml").exists()
        has_bin = (local_dir / "openvino_model.bin").exists()
        return not (has_xml and has_bin)

    hf_api = HfApi()
    try:
        model_info = hf_api.model_info(model_id, revision=revision or "main")
        if subfolder is None:
            prefix = None
            xml_path = "openvino_model.xml"
        else:
            prefix = Path(subfolder).as_posix()
            xml_path = f"{prefix}/openvino_model.xml"
        bin_path = xml_path.replace(".xml", ".bin")
        # Only files located under the requested subfolder are considered.
        candidates = {
            sibling.rfilename
            for sibling in model_info.siblings
            if prefix is None or sibling.rfilename.startswith(prefix)
        }
        return not (xml_path in candidates and bin_path in candidates)
    except Exception:
        # Best effort: if the listing cannot be inspected, report the IR
        # as missing.
        return True


99
class OpenVINOCausalLM(nn.Module):
    """Causal language model executed through an OpenVINO infer request.

    The constructor loads the model with ``OVModelForCausalLM`` (exporting
    it to OpenVINO IR first when no IR is found), applies the paged
    attention transformation, adjusts the KV-cache parameters and compiles
    the result for the device named by ``VLLM_OPENVINO_DEVICE``.
    """

    def __init__(
        self,
        ov_core: ov.Core,
        model_config: ModelConfig,
        kv_cache_dtype: ov.Type,
    ) -> None:
        """Load, transform and compile the model.

        Args:
            ov_core: OpenVINO core used to compile the model.
            model_config: Provides the model id/path, the
                ``trust_remote_code`` flag and the vocabulary size.
            kv_cache_dtype: Element type assigned to the KV-cache inputs.
        """
        super().__init__()
        # forward() already yields logits, hence ``logits_as_input=True``.
        self.logits_processor = LogitsProcessor(
            model_config.hf_config.vocab_size, logits_as_input=True)
        self.sampler = Sampler()

        export = _require_model_export(model_config.model)
        if export:
            logger.warning(
                f"Provided model id {model_config.model} does not "  # noqa: G004
                "contain OpenVINO IR, the model will be converted to IR with "
                "default options. If you need to use specific options for "
                "model conversion, use optimum-cli export openvino with "
                "desired options.")
        else:
            logger.warning(
                "OpenVINO IR is available for provided model id "  # noqa: G004
                f"{model_config.model}. This IR will be used for inference "
                "as-is, all possible options that may affect model conversion "
                "are ignored.")

        # Weight quantization is only requested when the model is exported
        # here; an existing IR is used as-is.
        load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
                        if export else False)
        pt_model = OVModelForCausalLM.from_pretrained(
            model_config.model,
            export=export,
            compile=False,
            load_in_8bit=load_in_8bit,
            trust_remote_code=model_config.trust_remote_code,
        )

        ov_device = envs.VLLM_OPENVINO_DEVICE
        paged_attention_transformation(pt_model.model)
        _modify_cache_parameters(pt_model.model, kv_cache_dtype,
                                 current_platform.is_openvino_cpu())

        ov_compiled = ov_core.compile_model(pt_model.model, ov_device)
        self.ov_request = ov_compiled.create_infer_request()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: list[tuple[ov.Tensor, ov.Tensor]],
    ) -> torch.Tensor:
        """Run one inference request and return the logits as a 2-D tensor.

        The request inputs are, in order: ``input_ids``, ``positions``, the
        flattened ``kv_caches`` and the attention metadata fields taken from
        the current forward context.
        """
        flat_kv_caches = _flatten_inputs(kv_caches)
        attn_metadata = get_forward_context().attn_metadata

        inputs = [
            input_ids,
            positions,
            *flat_kv_caches,
            attn_metadata.past_lens,
            attn_metadata.subsequence_begins,
            attn_metadata.block_indices,
            attn_metadata.block_indices_begins,
            attn_metadata.max_context_len,
        ]

        # Start the request and wait for it to finish before reading the
        # output tensor.
        self.ov_request.start_async(inputs, share_inputs=True)
        self.ov_request.wait()

        logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)

        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
        return logits.view(-1, logits.shape[-1])

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Prune ``hidden_states`` and pass them to the logits processor.

        ``None`` is passed as the first processor argument.
        """
        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
        logits = self.logits_processor(None, hidden_states, sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Return the sampler's output for ``logits``."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens


def get_model(
    vllm_config: VllmConfig,
    kv_cache_dtype: ov.Type,
    **kwargs,
) -> torch.nn.Module:
    """Build an ``OpenVINOCausalLM`` for the given vLLM configuration.

    Args:
        vllm_config: Configuration made current while the model is built;
            its ``model_config`` is forwarded to the model.
        kv_cache_dtype: Element type forwarded to the model for its
            KV-cache inputs.
        **kwargs: ``ov_core`` (required) is the OpenVINO core used to
            compile the model; ``lora_config`` must be absent or falsy.
            Other keyword arguments are ignored.

    Raises:
        ValueError: If a LoRA configuration is supplied, or if ``ov_core``
            is missing.
    """
    lora_config = kwargs.get("lora_config")
    ov_core = kwargs.get("ov_core")
    if lora_config:
        raise ValueError(
            "OpenVINO modeling does not support LoRA, "
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on github.")
    if ov_core is None:
        # Fail before the expensive model load/export instead of hitting an
        # attribute error on None when the model is compiled.
        raise ValueError(
            "OpenVINO modeling requires the 'ov_core' keyword argument "
            "to compile the model.")

    with set_current_vllm_config(vllm_config):
        return OpenVINOCausalLM(ov_core, vllm_config.model_config,
                                kv_cache_dtype)