Unverified commit ffc0f2b4, authored by Peter Salas, committed by GitHub
Browse files

[Model][OpenVINO] Fix regressions from #8346 (#10045)


Signed-off-by: Peter Salas <peter@fixie.ai>
parent 82bfc38d
...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Tuple, Type from typing import Dict, List, Optional, Tuple, Type
import openvino as ov import openvino as ov
import torch import torch
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm.attention.backends.abstract import (AttentionBackend, from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata) AttentionMetadata)
from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.backends.utils import CommonAttentionState
from vllm.multimodal import MultiModalPlaceholderMap
def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor, def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
...@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata: ...@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
# Shape: scalar # Shape: scalar
# Type: i32 # Type: i32
max_context_len: torch.Tensor max_context_len: torch.Tensor
# The index maps that relate multi-modal embeddings to the corresponding
# placeholders.
#
# N.B. These aren't really related to attention and don't belong on this
# type -- this is just a temporary solution to make them available to
# `model_executable`.
multi_modal_placeholder_index_maps: Optional[Dict[
str, MultiModalPlaceholderMap.IndexMap]]
...@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, ...@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
split_tensor_along_last_dim, split_tensor_along_last_dim,
tensor_model_parallel_all_gather) tensor_model_parallel_all_gather)
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
token_inputs) InputContext, token_inputs)
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, ...@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
if "image_masks" in out: if "image_masks" in out:
dummy_imgdata["image_masks"] = out["image_masks"] dummy_imgdata["image_masks"] = out["image_masks"]
dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
return dummy_seqdata, {"image": dummy_imgdata} return DummyData(dummy_seqdata, {"image": dummy_imgdata})
def pad_images( def pad_images(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment