Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
705f6a35
Commit
705f6a35
authored
Jul 16, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1
parents
af837396
4cf256ae
Changes
443
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
293 additions
and
47 deletions
+293
-47
vllm/model_executor/model_loader/__init__.py
vllm/model_executor/model_loader/__init__.py
+4
-4
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+79
-43
vllm/model_executor/model_loader/openvino.py
vllm/model_executor/model_loader/openvino.py
+210
-0
No files found.
Too many changes to show.
To preserve performance only
443 of 443+
files are displayed.
Plain diff
Email patch
vllm/model_executor/model_loader/__init__.py
View file @
705f6a35
...
...
@@ -3,8 +3,8 @@ from typing import Optional
from
torch
import
nn
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
Scheduler
Config
,
VisionLanguage
Config
)
ModelConfig
,
MultiModalConfig
,
Parallel
Config
,
Scheduler
Config
)
from
vllm.model_executor.model_loader.loader
import
(
BaseModelLoader
,
get_model_loader
)
from
vllm.model_executor.model_loader.utils
import
(
...
...
@@ -15,13 +15,13 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
device_config
:
DeviceConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
cache_config
:
CacheConfig
)
->
nn
.
Module
:
loader
=
get_model_loader
(
load_config
)
return
loader
.
load_model
(
model_config
=
model_config
,
device_config
=
device_config
,
lora_config
=
lora_config
,
vision_language_config
=
vision_language
_config
,
multimodal_config
=
multimodal
_config
,
parallel_config
=
parallel_config
,
scheduler_config
=
scheduler_config
,
cache_config
=
cache_config
)
...
...
vllm/model_executor/model_loader/loader.py
View file @
705f6a35
...
...
@@ -16,15 +16,15 @@ from huggingface_hub import HfApi, hf_hub_download
from
torch
import
nn
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoadFormat
,
LoRAConfig
,
ModelConfig
,
Paralle
lConfig
,
Schedu
le
r
Config
,
VisionLanguage
Config
)
LoRAConfig
,
ModelConfig
,
MultiModa
lConfig
,
Paral
le
l
Config
,
Scheduler
Config
)
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
is_vllm_tensorized
,
load_with_tensorizer
,
tensorizer_weights_iterator
)
serialize_vllm_model
,
tensorizer_weights_iterator
)
from
vllm.model_executor.model_loader.utils
import
(
get_model_architecture
,
set_default_torch_dtype
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -32,8 +32,11 @@ from vllm.model_executor.model_loader.weight_utils import (
filter_duplicate_safetensors_files
,
filter_files_not_needed_for_inference
,
get_quant_config
,
initialize_dummy_weights
,
np_cache_weights_iterator
,
pt_weights_iterator
,
safetensors_weights_iterator
)
from
vllm.model_executor.models.vlm_base
import
VisionLanguageModelBase
from
vllm.model_executor.models.interfaces
import
(
supports_lora
,
supports_vision
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.utils
import
is_tpu
logger
=
init_logger
(
__name__
)
...
...
@@ -44,7 +47,7 @@ def _get_quantization_config(
"""Get the quantization config."""
if
model_config
.
quantization
is
not
None
:
quant_config
=
get_quant_config
(
model_config
,
load_config
)
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
current_platform
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
if
capability
<
quant_config
.
get_min_capability
():
raise
ValueError
(
...
...
@@ -66,12 +69,15 @@ def _get_quantization_config(
def
_get_model_initialization_kwargs
(
model_class
:
Type
[
nn
.
Module
],
lora_config
:
Optional
[
LoRAConfig
],
vision_language_config
:
Optional
[
VisionLanguageConfig
]
model_class
:
Type
[
nn
.
Module
],
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
)
->
Dict
[
str
,
Any
]:
"""Get extra kwargs for model initialization."""
extra_kwargs
=
{}
if
hasattr
(
model_class
,
"supported_lora_modules"
):
extra_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
supports_lora
(
model_class
):
# lora_config=None is used to disable LoRA
extra_kwargs
[
"lora_config"
]
=
lora_config
elif
lora_config
:
raise
ValueError
(
...
...
@@ -79,19 +85,20 @@ def _get_model_initialization_kwargs(
"but LoRA is enabled. Support for this model may "
"be added in the future. If this is important to you, "
"please open an issue on github."
)
elif
issubclass
(
model_class
,
VisionLanguageModelBase
):
if
vision_language_config
is
None
:
raise
ValueError
(
"Provide `image_input_type` and other vision "
"related configurations through LLM entrypoint "
"or engine arguments."
)
extra_kwargs
[
"vision_language_config"
]
=
vision_language_config
if
supports_vision
(
model_class
):
if
multimodal_config
is
None
:
raise
ValueError
(
"Provide vision related configurations "
"through LLM entrypoint or engine arguments."
)
extra_kwargs
[
"multimodal_config"
]
=
multimodal_config
return
extra_kwargs
def
_initialize_model
(
model_config
:
ModelConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
cache_config
:
CacheConfig
)
->
nn
.
Module
:
"""Initialize a model with the given configurations."""
model_class
=
get_model_architecture
(
model_config
)[
0
]
...
...
@@ -101,7 +108,7 @@ def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
**
_get_model_initialization_kwargs
(
model_class
,
lora_config
,
vision_language
_config
))
model_class
,
lora_config
,
multimodal
_config
))
class
BaseModelLoader
(
ABC
):
...
...
@@ -114,7 +121,7 @@ class BaseModelLoader(ABC):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -230,24 +237,38 @@ class DefaultModelLoader(BaseModelLoader):
if
self
.
load_config
.
load_format
==
LoadFormat
.
NPCACHE
:
# Currently np_cache only support *.bin checkpoints
assert
use_safetensors
is
False
return
np_cache_weights_iterator
(
model_name_or_path
,
self
.
load_config
.
download_dir
,
hf_folder
,
hf_weights_files
)
if
use_safetensors
:
return
safetensors_weights_iterator
(
hf_weights_files
)
return
pt_weights_iterator
(
hf_weights_files
)
weights_iterator
=
np_cache_weights_iterator
(
model_name_or_path
,
self
.
load_config
.
download_dir
,
hf_folder
,
hf_weights_files
)
elif
use_safetensors
:
weights_iterator
=
safetensors_weights_iterator
(
hf_weights_files
)
else
:
weights_iterator
=
pt_weights_iterator
(
hf_weights_files
)
if
is_tpu
():
# In PyTorch XLA, we should call `xm.mark_step` frequently so that
# not too many ops are accumulated in the XLA program.
import
torch_xla.core.xla_model
as
xm
def
_xla_weights_iterator
(
iterator
:
Generator
):
for
weights
in
iterator
:
yield
weights
xm
.
mark_step
()
weights_iterator
=
_xla_weights_iterator
(
weights_iterator
)
return
weights_iterator
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
vision_language
_config
,
lora_config
,
multimodal
_config
,
cache_config
)
model
.
load_weights
(
self
.
_get_weights_iterator
(
model_config
.
model
,
...
...
@@ -280,14 +301,14 @@ class DummyModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
vision_language
_config
,
lora_config
,
multimodal
_config
,
cache_config
)
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
...
...
@@ -321,7 +342,7 @@ class TensorizerLoader(BaseModelLoader):
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
cache_config
:
CacheConfig
,
)
->
nn
.
Module
:
"""Load a serialized model with tensorizer to the CPU.
...
...
@@ -334,7 +355,7 @@ class TensorizerLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
vision_language
_config
,
lora_config
,
multimodal
_config
,
cache_config
)
model
.
load_weights
(
self
.
_get_weights_iterator
())
...
...
@@ -345,7 +366,7 @@ class TensorizerLoader(BaseModelLoader):
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
cache_config
:
CacheConfig
,
)
->
nn
.
Module
:
"""Load a serialized model with tensorizer.
...
...
@@ -359,7 +380,7 @@ class TensorizerLoader(BaseModelLoader):
quant_config
=
_get_quantization_config
(
model_config
,
self
.
load_config
)
extra_kwargs
=
_get_model_initialization_kwargs
(
model_class
,
lora_config
,
vision_language
_config
)
model_class
,
lora_config
,
multimodal
_config
)
extra_kwargs
[
"quant_config"
]
=
quant_config
extra_kwargs
[
"cache_config"
]
=
cache_config
...
...
@@ -374,22 +395,36 @@ class TensorizerLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
self
.
_verify_config
(
model_config
,
parallel_config
)
if
parallel_config
.
tensor_parallel_size
>
1
:
from
vllm.distributed
import
get_tensor_model_parallel_rank
self
.
tensorizer_config
.
tensorizer_uri
=
\
self
.
tensorizer_config
.
tensorizer_uri
\
%
get_tensor_model_parallel_rank
()
if
is_vllm_tensorized
(
self
.
tensorizer_config
):
return
self
.
_load_model_serialized
(
model_config
,
device_config
,
lora_config
,
vision_language_config
,
lora_config
,
multimodal_config
,
cache_config
)
return
self
.
_load_model_serialized_cpu
(
model_config
,
device_config
,
lora_config
,
vision_language_config
,
lora_config
,
multimodal_config
,
cache_config
)
@staticmethod
def save_model(
    model: torch.nn.Module,
    tensorizer_config: TensorizerConfig,
) -> None:
    """Serialize ``model`` by delegating to ``serialize_vllm_model``.

    Args:
        model: The module to serialize.
        tensorizer_config: Tensorizer settings forwarded unchanged to
            ``serialize_vllm_model``.
    """
    serialize_vllm_model(
        model=model,
        tensorizer_config=tensorizer_config,
    )
class
ShardedStateLoader
(
BaseModelLoader
):
"""
...
...
@@ -418,7 +453,8 @@ class ShardedStateLoader(BaseModelLoader):
Filter out all tensors that share the same memory or a subset of the
memory of another tensor.
"""
same_storage_groups
=
collections
.
defaultdict
(
list
)
same_storage_groups
:
Dict
[
Any
,
List
[
Tuple
[
str
,
torch
.
Tensor
]]]
=
collections
.
defaultdict
(
list
)
for
key
,
tensor
in
tensors
.
items
():
if
tensor
.
numel
():
ptr
=
tensor
.
untyped_storage
().
data_ptr
()
...
...
@@ -427,7 +463,7 @@ class ShardedStateLoader(BaseModelLoader):
def
get_end_ptr
(
tensor
:
torch
.
Tensor
)
->
int
:
return
tensor
.
view
(
-
1
)[
-
1
].
data_ptr
()
+
tensor
.
element_size
()
result
=
{}
result
:
Dict
[
str
,
torch
.
Tensor
]
=
{}
for
group
in
same_storage_groups
.
values
():
for
k
,
t
in
group
:
a
,
b
=
t
.
data_ptr
(),
get_end_ptr
(
t
)
...
...
@@ -459,7 +495,7 @@ class ShardedStateLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
...
...
@@ -473,7 +509,7 @@ class ShardedStateLoader(BaseModelLoader):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
vision_language
_config
,
lora_config
,
multimodal
_config
,
cache_config
)
rank
=
get_tensor_model_parallel_rank
()
pattern
=
os
.
path
.
join
(
...
...
@@ -769,14 +805,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
def
load_model
(
self
,
*
,
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
cache_config
:
CacheConfig
)
->
nn
.
Module
:
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model
=
_initialize_model
(
model_config
,
self
.
load_config
,
lora_config
,
vision_language
_config
,
lora_config
,
multimodal
_config
,
cache_config
)
self
.
_load_weights
(
model_config
,
model
)
...
...
vllm/model_executor/model_loader/openvino.py
0 → 100644
View file @
705f6a35
# ruff: noqa: SIM117
from
pathlib
import
Path
from
typing
import
List
,
Optional
,
Tuple
import
openvino
as
ov
import
torch
from
huggingface_hub
import
HfApi
from
openvino._offline_transformations
import
paged_attention_transformation
from
optimum.intel
import
OVModelForCausalLM
from
torch
import
nn
import
vllm.envs
as
envs
from
vllm.attention.backends.openvino
import
OpenVINOAttentionMetadata
from
vllm.config
import
DeviceConfig
,
ModelConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
(
LogitsProcessor
,
_prune_hidden_states
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
logger
=
init_logger
(
__name__
)
def _flattenize_inputs(inputs) -> List:
    """Flatten arbitrarily nested inputs into a single flat list.

    Lists and tuples are expanded recursively, dictionaries contribute
    their values (keys are discarded), and ``None`` entries are dropped at
    every nesting level.  Any other object is kept as a leaf.

    Args:
        inputs: Iterable whose items are leaves, ``None``, lists, tuples
            or dicts.

    Returns:
        A new list holding every non-``None`` leaf in traversal order.
    """
    flat_inputs: List = []
    for item in inputs:
        if item is None:
            continue
        if isinstance(item, (list, tuple)):
            flat_inputs.extend(_flattenize_inputs(item))
        elif isinstance(item, dict):
            # The recursive call only iterates its argument, so the values
            # view is passed directly instead of being copied into a list.
            flat_inputs.extend(_flattenize_inputs(item.values()))
        else:
            flat_inputs.append(item)
    return flat_inputs
def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
                             is_cpu: bool):
    """Rewrite the KV-cache parameters of ``model`` in place.

    Every model parameter whose output tensor has exactly one name that
    starts with ``"key_cache."`` or ``"value_cache."`` gets a new partial
    shape (the CPU or GPU layout, selected by ``is_cpu``) and the element
    type ``kv_cache_dtype``.  All other parameters are left untouched.
    The model is re-validated once the parameters have been visited.
    """
    # Apply hardware dependent modifications to KV tensors
    for param in model.get_parameters():
        tensor_names = param.get_output_tensor(0).get_names()
        if len(tensor_names) != 1:
            continue
        tensor_name = next(iter(tensor_names))

        shape = param.get_partial_shape()
        # use real block size if available, just a placeholder
        # to provide the expected rank
        x_size = 1
        num_blocks = ov.Dimension()
        block_size = ov.Dimension()
        head_size = ov.Dimension()

        # TODO: Negotiate required layout with plugins (CPU is ~OK, GPU is
        # TBD), pass more parameters to this function to set more static
        # dimensions
        if tensor_name.startswith("key_cache."):
            cpu_layout = [num_blocks, shape[1], block_size, head_size]
            if shape[2].is_static:
                split_dim = shape[2].get_length() // x_size
            else:
                split_dim = ov.Dimension()
            gpu_layout = [num_blocks, shape[1], split_dim, block_size, x_size]
        elif tensor_name.startswith("value_cache."):
            cpu_layout = [num_blocks, shape[1], block_size, head_size]
            gpu_layout = [num_blocks, shape[1], shape[2], block_size]
        else:
            continue

        chosen_layout = cpu_layout if is_cpu else gpu_layout
        param.set_partial_shape(ov.PartialShape(chosen_layout))
        param.set_element_type(kv_cache_dtype)

    model.validate_nodes_and_infer_types()
def _require_model_export(model_id: str,
                          revision: Optional[str] = None,
                          subfolder: Optional[str] = None) -> bool:
    """Tell whether the model still has to be exported to OpenVINO IR.

    The model counts as already exported only when both
    ``openvino_model.xml`` and ``openvino_model.bin`` are present.

    Args:
        model_id: Local directory path or Hugging Face Hub repository id.
        revision: Hub revision to inspect; ``"main"`` is used when falsy.
            Ignored for local directories.
        subfolder: Optional sub-directory that holds the IR files.

    Returns:
        ``False`` when both IR files are found, ``True`` otherwise --
        including when the Hub lookup fails for any reason.
    """
    model_dir = Path(model_id)
    if subfolder is not None:
        model_dir = model_dir / subfolder

    if model_dir.is_dir():
        xml_path = model_dir / "openvino_model.xml"
        bin_path = model_dir / "openvino_model.bin"
        return not (xml_path.exists() and bin_path.exists())

    hf_api = HfApi()
    try:
        model_info = hf_api.model_info(model_id, revision=revision or "main")
        prefix = ("" if subfolder is None else
                  f"{Path(subfolder).as_posix()}/")
        # ``siblings`` may be missing on the returned info object; treat
        # that as "no files" rather than as a lookup failure.
        repo_files = {
            sibling.rfilename
            for sibling in (model_info.siblings or [])
        }
        # Build both names from the same prefix so that a subfolder whose
        # name happens to contain ".xml" cannot corrupt the ".bin" path.
        return (f"{prefix}openvino_model.xml" not in repo_files
                or f"{prefix}openvino_model.bin" not in repo_files)
    except Exception as exc:
        # Best-effort probe: any failure falls back to exporting, but the
        # cause is recorded so that it is not lost entirely.
        logger.debug(
            "Could not inspect Hugging Face Hub files for %s (%s); "
            "assuming an OpenVINO export is required.", model_id, exc)
        return True
class OpenVINOCasualLM(nn.Module):
    """Causal language model executed through an OpenVINO infer request.

    The constructor loads the model with ``OVModelForCausalLM``, applies
    the paged-attention transformation and the KV-cache parameter rewrite,
    compiles the result and keeps a single infer request for ``forward``.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        device_config: DeviceConfig,
        kv_cache_dtype: ov.Type,
    ) -> None:
        """Load, transform and compile the model named by ``model_config``.

        Args:
            model_config: Supplies the model id, the vocabulary size and
                the ``trust_remote_code`` flag.
            device_config: Its device type selects the CPU or GPU KV-cache
                layout.
            kv_cache_dtype: Element type assigned to the KV-cache
                parameters.
        """
        super().__init__()
        self.logits_processor = LogitsProcessor(
            model_config.hf_config.vocab_size, logits_as_input=True)
        self.sampler = Sampler()

        needs_export = _require_model_export(model_config.model)
        if not needs_export:
            logger.warning(
                "OpenVINO IR is available for provided model id "  # noqa: G004
                f"{model_config.model}. This IR will be used for inference "
                "as-is, all possible options that may affect model conversion "
                "are ignored.")
        else:
            logger.warning(
                f"Provided model id {model_config.model} does not "  # noqa: G004
                "contain OpenVINO IR, the model will be converted to IR with "
                "default options. If you need to use specific options for "
                "model conversion, use optimum-cli export openvino with "
                "desired options.")

        hf_model = OVModelForCausalLM.from_pretrained(
            model_config.model,
            export=needs_export,
            compile=False,
            load_in_8bit=envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS,
            trust_remote_code=model_config.trust_remote_code,
        )

        paged_attention_transformation(hf_model.model)
        targets_cpu = device_config.device.type == "cpu"
        _modify_cache_parameters(hf_model.model, kv_cache_dtype, targets_cpu)

        compiled_model = ov.Core().compile_model(hf_model.model, "CPU")
        self.ov_request = compiled_model.create_infer_request()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
        attn_metadata: OpenVINOAttentionMetadata,
    ) -> torch.Tensor:
        """Run one inference step and return the logits as a 2-D tensor.

        The request inputs are, in order: ``input_ids``, ``positions``,
        the flattened KV caches, then the attention-metadata fields.
        """
        request_inputs = [input_ids, positions]
        request_inputs.extend(_flattenize_inputs(kv_caches))
        request_inputs.extend((
            attn_metadata.past_lens,
            attn_metadata.subsequence_begins,
            attn_metadata.block_indices,
            attn_metadata.block_indices_begins,
            attn_metadata.max_context_len,
        ))

        self.ov_request.start_async(request_inputs, share_inputs=True)
        self.ov_request.wait()

        logits_tensor = self.ov_request.get_tensor("logits")
        logits = torch.from_numpy(logits_tensor.data)

        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
        vocab_dim = logits.shape[-1]
        return logits.view(-1, vocab_dim)

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Prune ``hidden_states`` and pass them to the logits processor."""
        pruned_states = _prune_hidden_states(hidden_states, sampling_metadata)
        return self.logits_processor(None, pruned_states, sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Return the sampler's output for ``logits``."""
        return self.sampler(logits, sampling_metadata)
def get_model(
    model_config: ModelConfig,
    device_config: DeviceConfig,
    kv_cache_dtype: ov.Type,
    **kwargs,
) -> torch.nn.Module:
    """Build an ``OpenVINOCasualLM`` for the given configuration.

    Only ``lora_config`` is read from ``kwargs``; any other keyword
    argument is ignored.

    Raises:
        ValueError: If a truthy ``lora_config`` is passed in ``kwargs``.
    """
    if kwargs.get("lora_config"):
        raise ValueError(
            "OpenVINO modeling does not support LoRA, "
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on github.")

    return OpenVINOCasualLM(model_config, device_config, kv_cache_dtype)
Prev
1
…
19
20
21
22
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment