Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3ddbe255
Unverified
Commit
3ddbe255
authored
Oct 22, 2024
by
wangshuai09
Committed by
GitHub
Oct 22, 2024
Browse files
[Hardware][CPU] using current_platform.is_cpu (#9536)
parent
0d02747f
Changes
17
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
60 additions
and
64 deletions
+60
-64
tests/conftest.py
tests/conftest.py
+4
-2
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+3
-3
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+2
-1
tests/models/decoder_only/language/test_phimoe.py
tests/models/decoder_only/language/test_phimoe.py
+2
-2
tests/models/decoder_only/vision_language/test_fuyu.py
tests/models/decoder_only/vision_language/test_fuyu.py
+3
-3
tests/models/decoder_only/vision_language/test_internvl.py
tests/models/decoder_only/vision_language/test_internvl.py
+3
-3
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+3
-2
tests/models/utils.py
tests/models/utils.py
+4
-4
tests/worker/test_encoder_decoder_model_runner.py
tests/worker/test_encoder_decoder_model_runner.py
+6
-5
vllm/attention/backends/torch_sdpa.py
vllm/attention/backends/torch_sdpa.py
+4
-4
vllm/attention/ops/blocksparse_attention/interface.py
vllm/attention/ops/blocksparse_attention/interface.py
+10
-10
vllm/attention/selector.py
vllm/attention/selector.py
+3
-3
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+3
-3
vllm/model_executor/custom_op.py
vllm/model_executor/custom_op.py
+2
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+4
-4
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+3
-3
vllm/utils.py
vllm/utils.py
+1
-10
No files found.
tests/conftest.py
View file @
3ddbe255
...
...
@@ -32,9 +32,10 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
identity
,
is_cpu
)
identity
)
logger
=
init_logger
(
__name__
)
...
...
@@ -236,7 +237,8 @@ class HfRunner:
def
wrap_device
(
self
,
input
:
_T
,
device
:
Optional
[
str
]
=
None
)
->
_T
:
if
device
is
None
:
return
self
.
wrap_device
(
input
,
"cpu"
if
is_cpu
()
else
"cuda"
)
return
self
.
wrap_device
(
input
,
"cpu"
if
current_platform
.
is_cpu
()
else
"cuda"
)
if
hasattr
(
input
,
"device"
)
and
input
.
device
.
type
==
device
:
return
input
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
3ddbe255
...
...
@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
...
...
@@ -35,7 +35,7 @@ def vllm_to_hf_output(
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
is_cpu
(),
current_platform
.
is_cpu
(),
reason
=
"CPU backend is not currently supported with encoder/decoder models"
)
def
test_encoder_decoder_e2e
(
...
...
tests/kernels/test_attention_selector.py
View file @
3ddbe255
...
...
@@ -19,7 +19,8 @@ def test_env(name: str, device: str, monkeypatch):
override_backend_env_variable
(
monkeypatch
,
name
)
if
device
==
"cpu"
:
with
patch
(
"vllm.attention.selector.is_cpu"
,
return_value
=
True
):
with
patch
(
"vllm.attention.selector.current_platform.is_cpu"
,
return_value
=
True
):
backend
=
which_attn_to_use
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
name
==
"TORCH_SDPA"
...
...
tests/models/decoder_only/language/test_phimoe.py
View file @
3ddbe255
...
...
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
import
pytest
import
torch
from
vllm.
util
s
import
is_cpu
from
vllm.
platform
s
import
current_platform
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
...
...
@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
assert
torch
.
equal
(
topk_ids
,
ground_truth
[
test_id
][
"topk_ids"
])
@
pytest
.
mark
.
skipif
(
condition
=
is_cpu
(),
@
pytest
.
mark
.
skipif
(
condition
=
current_platform
.
is_cpu
(),
reason
=
"This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model."
)
@
large_gpu_test
(
min_gb
=
80
)
...
...
tests/models/decoder_only/vision_language/test_fuyu.py
View file @
3ddbe255
...
...
@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
import
pytest
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
...utils
import
check_logprobs_close
...
...
@@ -103,7 +103,7 @@ def run_test(
target_dtype
=
"half"
if
is_cpu
():
if
current_platform
.
is_cpu
():
target_dtype
=
"bfloat16"
...
...
tests/models/decoder_only/vision_language/test_internvl.py
View file @
3ddbe255
...
...
@@ -7,7 +7,7 @@ from PIL.Image import Image
from
transformers
import
AutoConfig
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.
util
s
import
is_cpu
from
vllm.
platform
s
import
current_platform
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
...
...
@@ -244,7 +244,7 @@ def run_awq_test(
target_dtype
=
"half"
if
is_cpu
():
if
current_platform
.
is_cpu
():
target_dtype
=
"bfloat16"
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
3ddbe255
...
...
@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
,
is_hip
from
vllm.utils
import
is_hip
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
...
...
@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype
=
"half"
if
is_cpu
():
if
current_platform
.
is_cpu
():
target_dtype
=
"bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
...
...
tests/models/utils.py
View file @
3ddbe255
...
...
@@ -5,8 +5,8 @@ import torch
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.utils
import
is_cpu
TokensText
=
Tuple
[
List
[
int
],
str
]
...
...
@@ -270,7 +270,7 @@ def build_model_context(model_name: str,
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
if
dtype
is
None
:
dtype
=
"bfloat16"
if
is_cpu
()
else
"half"
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
model_config
=
ModelConfig
(
model_name
,
...
...
tests/worker/test_encoder_decoder_model_runner.py
View file @
3ddbe255
...
...
@@ -5,8 +5,9 @@ import pytest
import
torch
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
is_cpu
,
make_tensor_with_pad
from
vllm.utils
import
make_tensor_with_pad
from
vllm.worker.enc_dec_model_runner
import
EncoderDecoderModelRunner
from
vllm.worker.model_runner
import
_get_graph_batch_size
...
...
@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
return
model_runner
@
pytest
.
mark
.
skipif
(
condition
=
is_cpu
(),
@
pytest
.
mark
.
skipif
(
condition
=
current_platform
.
is_cpu
(),
reason
=
"CPU backend is currently "
"unsupported for encoder/ "
"decoder models"
)
...
...
@@ -74,7 +75,7 @@ def test_empty_seq_group():
assert
return_seq_lens
is
None
@
pytest
.
mark
.
skipif
(
condition
=
is_cpu
(),
@
pytest
.
mark
.
skipif
(
condition
=
current_platform
.
is_cpu
(),
reason
=
"CPU backend is currently "
"unsupported for encoder/ "
"decoder models"
)
...
...
@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
assert
torch
.
equal
(
actual
,
expected
)
@
pytest
.
mark
.
skipif
(
condition
=
is_cpu
(),
@
pytest
.
mark
.
skipif
(
condition
=
current_platform
.
is_cpu
(),
reason
=
"CPU backend is currently "
"unsupported for encoder/ "
"decoder models"
)
...
...
vllm/attention/backends/torch_sdpa.py
View file @
3ddbe255
...
...
@@ -10,9 +10,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata
,
AttentionType
)
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.attention.ops.paged_attn
import
PagedAttentionMetadata
from
vllm.
util
s
import
is_cpu
from
vllm.
platform
s
import
current_platform
if
is_cpu
():
if
current_platform
.
is_cpu
():
try
:
from
vllm.attention.ops.ipex_attn
import
PagedAttention
except
ImportError
:
...
...
vllm/attention/ops/blocksparse_attention/interface.py
View file @
3ddbe255
...
...
@@ -3,7 +3,7 @@ import math
import
torch
from
vllm.platforms
import
current_platform
from
vllm.utils
import
is_cpu
,
is_hip
from
vllm.utils
import
is_hip
from
.utils
import
(
dense_to_crow_col
,
get_head_sliding_step
,
get_sparse_attn_mask
)
...
...
@@ -32,7 +32,7 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
):
super
().
__init__
()
if
use_spda
is
None
:
use_spda
=
is_hip
()
or
is_cpu
()
or
not
\
use_spda
=
is_hip
()
or
current_platform
.
is_cpu
()
or
not
\
IS_COMPUTE_8_OR_ABOVE
device
=
device
or
(
torch
.
cuda
.
current_device
()
if
current_platform
.
is_cuda_alike
()
else
"cpu"
)
...
...
vllm/attention/selector.py
View file @
3ddbe255
...
...
@@ -10,7 +10,7 @@ import vllm.envs as envs
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
,
is_cpu
,
is_hip
,
is_openvino
,
is_xpu
from
vllm.utils
import
STR_BACKEND_ENV_VAR
,
is_hip
,
is_openvino
,
is_xpu
logger
=
init_logger
(
__name__
)
...
...
@@ -121,7 +121,7 @@ def get_attn_backend(
ROCmFlashAttentionBackend
)
return
ROCmFlashAttentionBackend
elif
backend
==
_Backend
.
TORCH_SDPA
:
assert
is_cpu
(),
RuntimeError
(
assert
current_platform
.
is_cpu
(),
RuntimeError
(
"Torch SDPA backend is only used for the CPU device."
)
logger
.
info
(
"Using Torch SDPA backend."
)
from
vllm.attention.backends.torch_sdpa
import
TorchSDPABackend
...
...
@@ -183,7 +183,7 @@ def which_attn_to_use(
if
backend_by_env_var
is
not
None
:
selected_backend
=
backend_name_to_enum
(
backend_by_env_var
)
if
is_cpu
():
if
current_platform
.
is_cpu
():
if
selected_backend
!=
_Backend
.
TORCH_SDPA
:
logger
.
info
(
"Cannot use %s backend on CPU."
,
selected_backend
)
return
_Backend
.
TORCH_SDPA
...
...
vllm/distributed/parallel_state.py
View file @
3ddbe255
...
...
@@ -37,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.utils
import
is_cpu
,
supports_custom_op
from
vllm.utils
import
supports_custom_op
@
dataclass
...
...
@@ -1139,7 +1139,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
import
ray
# Lazy import Ray
ray
.
shutdown
()
gc
.
collect
()
if
not
is_cpu
():
if
not
current_platform
.
is_cpu
():
torch
.
cuda
.
empty_cache
()
...
...
vllm/model_executor/custom_op.py
View file @
3ddbe255
...
...
@@ -7,7 +7,7 @@ import vllm.envs as envs
from
vllm.compilation.levels
import
CompilationLevel
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.utils
import
is_cpu
,
is_hip
,
is_xpu
,
print_warning_once
from
vllm.utils
import
is_hip
,
is_xpu
,
print_warning_once
logger
=
init_logger
(
__name__
)
...
...
@@ -74,7 +74,7 @@ class CustomOp(nn.Module):
if
is_hip
():
return
self
.
forward_hip
elif
is_cpu
():
elif
current_platform
.
is_cpu
():
return
self
.
forward_cpu
elif
current_platform
.
is_tpu
():
return
self
.
forward_tpu
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
3ddbe255
vllm/model_executor/models/utils.py
View file @
3ddbe255
...
...
@@ -21,7 +21,7 @@ from vllm.model_executor.models import ModelRegistry
from
vllm.multimodal.base
import
NestedTensors
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_cpu
,
is_pin_memory_available
from
vllm.utils
import
is_pin_memory_available
logger
=
init_logger
(
__name__
)
...
...
@@ -515,7 +515,7 @@ def get_vit_attn_backend() -> _Backend:
"so we use xformers backend instead. You can run "
"`pip install flash-attn` to use flash-attention backend."
)
selected_backend
=
_Backend
.
XFORMERS
elif
is_cpu
():
elif
current_platform
.
is_cpu
():
selected_backend
=
_Backend
.
TORCH_SDPA
else
:
selected_backend
=
_Backend
.
XFORMERS
...
...
vllm/utils.py
View file @
3ddbe255
...
...
@@ -318,15 +318,6 @@ def is_hip() -> bool:
return
torch
.
version
.
hip
is
not
None
@
lru_cache
(
maxsize
=
None
)
def
is_cpu
()
->
bool
:
from
importlib.metadata
import
PackageNotFoundError
,
version
try
:
return
"cpu"
in
version
(
"vllm"
)
except
PackageNotFoundError
:
return
False
@
lru_cache
(
maxsize
=
None
)
def
is_openvino
()
->
bool
:
from
importlib.metadata
import
PackageNotFoundError
,
version
...
...
@@ -798,7 +789,7 @@ def is_pin_memory_available() -> bool:
elif
is_neuron
():
print_warning_once
(
"Pin memory is not supported on Neuron."
)
return
False
elif
is_cpu
()
or
is_openvino
():
elif
current_platform
.
is_cpu
()
or
is_openvino
():
return
False
return
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment