Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
58170d65
Unverified
Commit
58170d65
authored
Nov 11, 2024
by
Isotr0py
Committed by
GitHub
Nov 11, 2024
Browse files
[Hardware][CPU] Add embedding models support for CPU backend (#10193)
Signed-off-by:
Isotr0py
<
2037008807@qq.com
>
parent
9804ac7c
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
185 additions
and
52 deletions
+185
-52
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test-ppc64le.sh
+1
-2
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+1
-2
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+4
-3
vllm/attention/backends/torch_sdpa.py
vllm/attention/backends/torch_sdpa.py
+10
-4
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert.py
+0
-6
vllm/worker/cpu_embedding_model_runner.py
vllm/worker/cpu_embedding_model_runner.py
+122
-0
vllm/worker/cpu_enc_dec_model_runner.py
vllm/worker/cpu_enc_dec_model_runner.py
+5
-6
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+35
-22
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+7
-7
No files found.
.buildkite/run-cpu-test-ppc64le.sh
View file @
58170d65
...
...
@@ -25,8 +25,7 @@ function cpu_tests() {
decord einops librosa peft Pillow sentence-transformers soundfile
\
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
...
...
.buildkite/run-cpu-test.sh
View file @
58170d65
...
...
@@ -32,8 +32,7 @@ function cpu_tests() {
decord einops librosa peft Pillow sentence-transformers soundfile
\
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
...
...
tests/models/embedding/language/test_embedding.py
View file @
58170d65
...
...
@@ -4,6 +4,8 @@ Run `pytest tests/models/embedding/language/test_embedding.py`.
"""
import
pytest
from
vllm.utils
import
current_platform
from
..utils
import
check_embeddings_close
# Model, Guard
...
...
@@ -21,15 +23,14 @@ ENCODER_ONLY = [
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models
(
monkeypatch
,
hf_runner
,
vllm_runner
,
example_prompts
,
model
,
dtype
:
str
,
)
->
None
:
if
model
in
ENCODER_ONLY
:
monkeypatch
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"XFORMERS
"
)
if
model
not
in
ENCODER_ONLY
and
current_platform
.
is_cpu
()
:
pytest
.
skip
(
"Skip large embedding models test on CPU.
"
)
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
...
...
vllm/attention/backends/torch_sdpa.py
View file @
58170d65
...
...
@@ -158,7 +158,8 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
* Appropriate sequence lengths tensor for key & value
'''
if
attn_type
==
AttentionType
.
DECODER
:
if
(
attn_type
==
AttentionType
.
DECODER
or
attn_type
==
AttentionType
.
ENCODER_ONLY
):
seq_lens_q
=
self
.
seq_lens
seq_lens_kv
=
self
.
seq_lens
elif
attn_type
==
AttentionType
.
ENCODER
:
...
...
@@ -189,7 +190,8 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
* Appropriate attention bias value given the attention type
'''
if
attn_type
==
AttentionType
.
DECODER
:
if
(
attn_type
==
AttentionType
.
DECODER
or
attn_type
==
AttentionType
.
ENCODER_ONLY
):
return
self
.
attn_bias
elif
attn_type
==
AttentionType
.
ENCODER
:
return
self
.
encoder_attn_bias
...
...
@@ -215,7 +217,8 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
encoder/decoder cross-attention
'''
if
attn_type
==
AttentionType
.
DECODER
:
if
(
attn_type
==
AttentionType
.
DECODER
or
attn_type
==
AttentionType
.
ENCODER_ONLY
):
self
.
attn_bias
=
attn_bias
elif
attn_type
==
AttentionType
.
ENCODER
:
self
.
encoder_attn_bias
=
attn_bias
...
...
@@ -252,7 +255,8 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
* Appropriate block tables (or None)
'''
if
attn_type
==
AttentionType
.
DECODER
:
if
(
attn_type
==
AttentionType
.
DECODER
or
attn_type
==
AttentionType
.
ENCODER_ONLY
):
# Decoder self-attention
# Choose max_seq_len based on whether we are in prompt_run
return
(
self
.
seq_lens_tensor
,
self
.
max_decode_seq_len
,
...
...
@@ -420,6 +424,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
"Torch SDPA backend doesn't support prefix decoding."
)
if
decode_meta
:
=
attn_metadata
.
decode_metadata
:
assert
attn_type
!=
AttentionType
.
ENCODER_ONLY
,
(
"Encoder-only models should not have decode metadata."
)
# Decoding run.
(
seq_lens_arg
,
...
...
vllm/model_executor/models/bert.py
View file @
58170d65
...
...
@@ -5,7 +5,6 @@ from torch import nn
from
transformers
import
BertConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.attention.backends.xformers
import
XFormersImpl
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
...
...
@@ -218,11 +217,6 @@ class BertSelfAttention(nn.Module):
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.attn"
)
if
not
isinstance
(
self
.
attn
.
impl
,
XFormersImpl
):
raise
ValueError
(
"Encoder-only models currently require XFORMERS attention "
"backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT."
)
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
,
...
...
vllm/worker/cpu_embedding_model_runner.py
0 → 100644
View file @
58170d65
import
dataclasses
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
import
torch
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.pooling_params
import
PoolingParams
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.worker.cpu_model_runner
import
(
CPUModelRunnerBase
,
ModelInputForCPU
,
ModelInputForCPUBuilder
)
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ModelInputForCPUWithPoolingMetadata
(
ModelInputForCPU
):
"""
Used by the CPUEmbeddingModelRunner.
"""
pooling_metadata
:
Optional
[
"PoolingMetadata"
]
=
None
class
CPUEmbeddingModelRunner
(
CPUModelRunnerBase
[
ModelInputForCPUWithPoolingMetadata
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithPoolingMetadata
]
=
(
ModelInputForCPUWithPoolingMetadata
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
@
torch
.
inference_mode
()
def
execute_model
(
self
,
model_input
:
ModelInputForCPUWithPoolingMetadata
,
kv_caches
:
List
[
torch
.
Tensor
],
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
num_steps
:
int
=
1
,
)
->
Optional
[
Union
[
List
[
PoolerOutput
],
IntermediateTensors
]]:
if
num_steps
>
1
:
raise
ValueError
(
"CPU worker does not support multi-step execution."
)
num_layers
=
self
.
model_config
.
get_num_layers
(
self
.
parallel_config
)
# use an empty tensor instead of `None`` to force Dynamo to pass
# it by reference, rather by specializing on the value ``None``.
# the `dtype` argument does not matter, and we use `float32` as
# a placeholder (it has wide hardware support).
kv_caches
=
[
torch
.
tensor
([],
dtype
=
torch
.
float32
,
device
=
self
.
device
)
for
_
in
range
(
num_layers
)
]
model_executable
=
self
.
model
execute_model_kwargs
=
{
"input_ids"
:
model_input
.
input_tokens
,
"positions"
:
model_input
.
input_positions
,
"kv_caches"
:
kv_caches
,
"attn_metadata"
:
model_input
.
attn_metadata
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
"intermediate_tensors"
:
intermediate_tensors
,
}
hidden_states
=
model_executable
(
**
execute_model_kwargs
)
return
[
self
.
model
.
pooler
(
hidden_states
=
hidden_states
,
pooling_metadata
=
model_input
.
pooling_metadata
)
]
def
make_model_input_from_broadcasted_tensor_dict
(
self
,
tensor_dict
:
Dict
[
str
,
Any
])
->
ModelInputForCPUWithPoolingMetadata
:
return
ModelInputForCPUWithPoolingMetadata
.
from_broadcasted_tensor_dict
(
tensor_dict
,
attn_backend
=
self
.
attn_backend
,
)
def
prepare_model_input
(
self
,
seq_group_metadata_list
:
Optional
[
List
[
SequenceGroupMetadata
]],
virtual_engine
:
int
=
0
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
ModelInputForCPUWithPoolingMetadata
:
assert
seq_group_metadata_list
is
not
None
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
# Prepare PoolingMetadata.
assert
model_input
.
seq_lens
is
not
None
pooling_metadata
=
self
.
_prepare_pooling
(
seq_group_metadata_list
,
model_input
.
seq_lens
)
return
dataclasses
.
replace
(
model_input
,
pooling_metadata
=
pooling_metadata
)
def
_prepare_pooling
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
prompt_lens
:
List
[
int
],
)
->
PoolingMetadata
:
"""Prepare PoolingMetadata for the sequence group metadata list."""
seq_groups
:
List
[
Tuple
[
List
[
int
],
PoolingParams
]]
=
[]
for
i
,
seq_group_metadata
in
enumerate
(
seq_group_metadata_list
):
seq_ids
=
list
(
seq_group_metadata
.
seq_data
.
keys
())
pooling_params
=
seq_group_metadata
.
pooling_params
seq_groups
.
append
((
seq_ids
,
pooling_params
))
seq_data
:
Dict
[
int
,
SequenceData
]
=
{}
for
seq_group_metadata
in
seq_group_metadata_list
:
seq_data
.
update
(
seq_group_metadata
.
seq_data
)
pooling_metadata
=
PoolingMetadata
(
seq_groups
=
seq_groups
,
seq_data
=
seq_data
,
prompt_lens
=
prompt_lens
,
)
return
pooling_metadata
vllm/worker/cpu_enc_dec_model_runner.py
View file @
58170d65
...
...
@@ -8,7 +8,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
make_tensor_with_pad
from
vllm.worker.cpu_model_runner
import
(
CPUModelRunner
,
from
vllm.worker.cpu_model_runner
import
(
CPUModelRunner
Base
,
ModelInputForCPUBuilder
,
ModelInputForCPUWithSamplingMetadata
)
from
vllm.worker.model_runner_base
import
(
...
...
@@ -50,7 +50,8 @@ class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata):
super
().
from_broadcasted_tensor_dict
(
tensor_dict
,
attn_backend
))
class
CPUEncoderDecoderModelRunner
(
CPUModelRunner
):
class
CPUEncoderDecoderModelRunner
(
CPUModelRunnerBase
[
EncoderDecoderModelInputForCPU
]):
_model_input_cls
:
Type
[
EncoderDecoderModelInputForCPU
]
=
(
EncoderDecoderModelInputForCPU
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
...
...
@@ -87,10 +88,8 @@ class CPUEncoderDecoderModelRunner(CPUModelRunner):
virtual_engine
:
int
=
0
,
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
EncoderDecoderModelInputForCPU
:
model_input
=
super
().
prepare_model_input
(
seq_group_metadata_list
,
virtual_engine
,
finished_requests_ids
)
model_input
=
cast
(
EncoderDecoderModelInputForCPU
,
model_input
)
model_input
=
self
.
_prepare_model_input_tensors
(
seq_group_metadata_list
,
finished_requests_ids
)
(
attn_metadata
,
encoder_input_tokens_tensor
,
...
...
vllm/worker/cpu_model_runner.py
View file @
58170d65
...
...
@@ -2,7 +2,8 @@ import dataclasses
import
weakref
from
collections
import
defaultdict
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
Union
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
TypeVar
,
Union
)
import
torch
from
torch
import
nn
...
...
@@ -31,6 +32,7 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
TModelInputForCPU
=
TypeVar
(
'TModelInputForCPU'
,
bound
=
"ModelInputForCPU"
)
_PAD_SLOT_ID
=
-
1
...
...
@@ -60,10 +62,10 @@ class ModelInputForCPU(ModelRunnerInputBase):
@
classmethod
def
from_broadcasted_tensor_dict
(
cls
:
Type
[
"
ModelInputForCPU
"
],
cls
:
Type
[
T
ModelInputForCPU
],
tensor_dict
:
Dict
[
str
,
Any
],
attn_backend
:
Optional
[
"AttentionBackend"
]
=
None
)
->
"
ModelInputForCPU
"
:
)
->
T
ModelInputForCPU
:
if
attn_backend
is
not
None
:
tensor_dict
=
_init_attn_metadata_from_tensor_dict
(
attn_backend
,
tensor_dict
)
...
...
@@ -255,11 +257,14 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
slot_mapping
.
append
(
_PAD_SLOT_ID
)
continue
block_number
=
block_table
[
i
//
self
.
block_size
]
# type: ignore
block_offset
=
i
%
self
.
block_size
# type: ignore
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
.
append
(
slot
)
# For encoder-only models, the block_table is None,
# and there is no need to initialize the slot_mapping.
if
block_table
is
not
None
:
block_number
=
block_table
[
i
//
self
.
block_size
]
# type: ignore
block_offset
=
i
%
self
.
block_size
# type: ignore
slot
=
block_number
*
self
.
block_size
+
block_offset
slot_mapping
.
append
(
slot
)
if
any
(
input_mrope_positions
):
input_positions
=
None
# type: ignore
...
...
@@ -402,10 +407,12 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
)
class
CPUModelRunner
(
ModelRunnerBase
[
ModelInputForCPU
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithSamplingMetadata
]
=
(
ModelInputForCPUWithSamplingMetadata
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
class
CPUModelRunnerBase
(
ModelRunnerBase
[
TModelInputForCPU
]):
"""
Helper class for shared methods between CPU model runners.
"""
_model_input_cls
:
Type
[
TModelInputForCPU
]
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
def
__init__
(
self
,
...
...
@@ -448,20 +455,11 @@ class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
def
load_model
(
self
)
->
None
:
self
.
model
=
get_model
(
vllm_config
=
self
.
vllm_config
)
def
make_model_input_from_broadcasted_tensor_dict
(
self
,
tensor_dict
:
Dict
[
str
,
Any
],
)
->
ModelInputForCPUWithSamplingMetadata
:
return
ModelInputForCPUWithSamplingMetadata
.
from_broadcasted_tensor_dict
(
# noqa: E501
tensor_dict
,
attn_backend
=
self
.
attn_backend
,
)
def
_prepare_model_input_tensors
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
finished_requests_ids
:
Optional
[
List
[
str
]]
=
None
)
->
ModelInputForCPU
WithSamplingMetadata
:
)
->
T
ModelInputForCPU
:
"""Helper method to prepare the model input based on a given sequence
group. Prepares metadata needed for the base model forward pass but not
metadata for possible additional steps, e.g., sampling.
...
...
@@ -473,6 +471,21 @@ class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
return
builder
.
build
()
# type: ignore
class
CPUModelRunner
(
CPUModelRunnerBase
[
ModelInputForCPUWithSamplingMetadata
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithSamplingMetadata
]
=
(
ModelInputForCPUWithSamplingMetadata
)
_builder_cls
:
Type
[
ModelInputForCPUBuilder
]
=
ModelInputForCPUBuilder
def
make_model_input_from_broadcasted_tensor_dict
(
self
,
tensor_dict
:
Dict
[
str
,
Any
],
)
->
ModelInputForCPUWithSamplingMetadata
:
return
ModelInputForCPUWithSamplingMetadata
.
from_broadcasted_tensor_dict
(
# noqa: E501
tensor_dict
,
attn_backend
=
self
.
attn_backend
,
)
def
prepare_model_input
(
self
,
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
...
...
vllm/worker/cpu_worker.py
View file @
58170d65
...
...
@@ -14,8 +14,9 @@ from vllm.logger import init_logger
from
vllm.model_executor
import
set_random_seed
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.worker.cpu_embedding_model_runner
import
CPUEmbeddingModelRunner
from
vllm.worker.cpu_enc_dec_model_runner
import
CPUEncoderDecoderModelRunner
from
vllm.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.worker.cpu_model_runner
import
CPUModelRunner
,
CPUModelRunnerBase
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
LoraNotSupportedWorkerBase
,
WorkerBase
,
WorkerInput
)
...
...
@@ -150,21 +151,20 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
else
:
self
.
local_omp_cpuid
=
omp_cpuids
.
split
(
"|"
)[
rank
]
ModelRunnerClass
:
Type
[
CPUModelRunner
]
=
CPUModelRunner
ModelRunnerClass
:
Type
[
CPUModelRunner
Base
]
=
CPUModelRunner
if
self
.
model_config
.
task
==
"embedding"
:
raise
NotImplementedError
(
"Embedding models are not supported for CPU backend"
)
# ModelRunnerClass = CPUEmbeddingModelRunner
ModelRunnerClass
=
CPUEmbeddingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
CPUEncoderDecoderModelRunner
self
.
model_runner
:
CPUModelRunner
=
ModelRunnerClass
(
self
.
model_runner
:
CPUModelRunner
Base
=
ModelRunnerClass
(
vllm_config
=
vllm_config
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
)
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self
.
cache_engine
:
List
[
CPUCacheEngine
]
self
.
cpu_cache
:
List
[
List
[
torch
.
Tensor
]]
# Initialize cpu_cache as embedding models don't initialize kv_caches
self
.
cpu_cache
:
Optional
[
List
[
List
[
torch
.
Tensor
]]]
=
None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment