Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ffc0f2b4
Unverified
Commit
ffc0f2b4
authored
Nov 05, 2024
by
Peter Salas
Committed by
GitHub
Nov 06, 2024
Browse files
[Model][OpenVINO] Fix regressions from #8346 (#10045)
Signed-off-by:
Peter Salas
<
peter@fixie.ai
>
parent
82bfc38d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
5 deletions
+15
-5
.buildkite/run-openvino-test.sh
.buildkite/run-openvino-test.sh
+1
-1
vllm/attention/backends/openvino.py
vllm/attention/backends/openvino.py
+11
-1
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+3
-3
No files found.
.buildkite/run-openvino-test.sh
View file @
ffc0f2b4
...
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
...
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
remove_docker_container
# Run the image and launch offline inference
# Run the image and launch offline inference
docker run
--network
host
--env
VLLM_OPENVINO_KVCACHE_SPACE
=
1
--name
openvino-test openvino-test python3 /workspace/
vllm/
examples/offline_inference.py
docker run
--network
host
--env
VLLM_OPENVINO_KVCACHE_SPACE
=
1
--name
openvino-test openvino-test python3 /workspace/examples/offline_inference.py
vllm/attention/backends/openvino.py
View file @
ffc0f2b4
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
,
Tuple
,
Type
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Type
import
openvino
as
ov
import
openvino
as
ov
import
torch
import
torch
...
@@ -7,6 +7,7 @@ import torch
...
@@ -7,6 +7,7 @@ import torch
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionMetadata
)
AttentionMetadata
)
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.multimodal
import
MultiModalPlaceholderMap
def
copy_cache_block
(
src_tensor
:
ov
.
Tensor
,
dst_tensor
:
ov
.
Tensor
,
def
copy_cache_block
(
src_tensor
:
ov
.
Tensor
,
dst_tensor
:
ov
.
Tensor
,
...
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
...
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
# Shape: scalar
# Shape: scalar
# Type: i32
# Type: i32
max_context_len
:
torch
.
Tensor
max_context_len
:
torch
.
Tensor
# The index maps that relate multi-modal embeddings to the corresponding
# placeholders.
#
# N.B. These aren't really related to attention and don't belong on this
# type -- this is just a temporary solution to make them available to
# `model_executable`.
multi_modal_placeholder_index_maps
:
Optional
[
Dict
[
str
,
MultiModalPlaceholderMap
.
IndexMap
]]
vllm/model_executor/models/molmo.py
View file @
ffc0f2b4
...
@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
...
@@ -21,8 +21,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
split_tensor_along_last_dim
,
split_tensor_along_last_dim
,
tensor_model_parallel_all_gather
)
tensor_model_parallel_all_gather
)
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
InputContext
,
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
token_inputs
)
InputContext
,
token_inputs
)
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
QuickGELU
,
SiluAndMul
from
vllm.model_executor.layers.activation
import
QuickGELU
,
SiluAndMul
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
...
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
...
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
if
"image_masks"
in
out
:
if
"image_masks"
in
out
:
dummy_imgdata
[
"image_masks"
]
=
out
[
"image_masks"
]
dummy_imgdata
[
"image_masks"
]
=
out
[
"image_masks"
]
dummy_imgdata
[
"seq_len"
]
=
torch
.
tensor
(
seq_len
,
dtype
=
torch
.
long
)
dummy_imgdata
[
"seq_len"
]
=
torch
.
tensor
(
seq_len
,
dtype
=
torch
.
long
)
return
dummy_seqdata
,
{
"image"
:
dummy_imgdata
}
return
DummyData
(
dummy_seqdata
,
{
"image"
:
dummy_imgdata
}
)
def
pad_images
(
def
pad_images
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment