Unverified Commit 3ddbe255 authored by wangshuai09's avatar wangshuai09 Committed by GitHub
Browse files

[Hardware][CPU] using current_platform.is_cpu (#9536)

parent 0d02747f
...@@ -32,9 +32,10 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, ...@@ -32,9 +32,10 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts) to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu) identity)
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -236,7 +237,8 @@ class HfRunner: ...@@ -236,7 +237,8 @@ class HfRunner:
def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
if device is None: if device is None:
return self.wrap_device(input, "cpu" if is_cpu() else "cuda") return self.wrap_device(
input, "cpu" if current_platform.is_cpu() else "cuda")
if hasattr(input, "device") and input.device.type == device: if hasattr(input, "device") and input.device.type == device:
return input return input
......
...@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple ...@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple
import pytest import pytest
from transformers import AutoModelForSeq2SeqLM from transformers import AutoModelForSeq2SeqLM
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ..conftest import DecoderPromptType from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
...@@ -35,7 +35,7 @@ def vllm_to_hf_output( ...@@ -35,7 +35,7 @@ def vllm_to_hf_output(
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif( @pytest.mark.skipif(
is_cpu(), current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models" reason="CPU backend is not currently supported with encoder/decoder models"
) )
def test_encoder_decoder_e2e( def test_encoder_decoder_e2e(
......
...@@ -19,7 +19,8 @@ def test_env(name: str, device: str, monkeypatch): ...@@ -19,7 +19,8 @@ def test_env(name: str, device: str, monkeypatch):
override_backend_env_variable(monkeypatch, name) override_backend_env_variable(monkeypatch, name)
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.is_cpu", return_value=True): with patch("vllm.attention.selector.current_platform.is_cpu",
return_value=True):
backend = which_attn_to_use(16, torch.float16, torch.float16, 16, backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
False) False)
assert backend.name == "TORCH_SDPA" assert backend.name == "TORCH_SDPA"
......
...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`. ...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
import pytest import pytest
import torch import torch
from vllm.utils import is_cpu from vllm.platforms import current_platform
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -70,7 +70,7 @@ def test_phimoe_routing_function(): ...@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, " reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.") "and vllm CI's disk space is not enough for this model.")
@large_gpu_test(min_gb=80) @large_gpu_test(min_gb=80)
......
...@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type ...@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
import pytest import pytest
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -103,7 +103,7 @@ def run_test( ...@@ -103,7 +103,7 @@ def run_test(
target_dtype = "half" target_dtype = "half"
if is_cpu(): if current_platform.is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
......
...@@ -7,7 +7,7 @@ from PIL.Image import Image ...@@ -7,7 +7,7 @@ from PIL.Image import Image
from transformers import AutoConfig from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
...@@ -244,7 +244,7 @@ def run_awq_test( ...@@ -244,7 +244,7 @@ def run_awq_test(
target_dtype = "half" target_dtype = "half"
if is_cpu(): if current_platform.is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
......
...@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs ...@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip from vllm.utils import is_hip
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
...@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype = "half" target_dtype = "half"
if is_cpu(): if current_platform.is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
# ROCm Triton FA can run into shared memory issues with these models, # ROCm Triton FA can run into shared memory issues with these models,
......
...@@ -5,8 +5,8 @@ import torch ...@@ -5,8 +5,8 @@ import torch
from vllm.config import ModelConfig, TaskOption from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from vllm.utils import is_cpu
TokensText = Tuple[List[int], str] TokensText = Tuple[List[int], str]
...@@ -270,7 +270,7 @@ def build_model_context(model_name: str, ...@@ -270,7 +270,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None: if tokenizer_name is None:
tokenizer_name = model_name tokenizer_name = model_name
if dtype is None: if dtype is None:
dtype = "bfloat16" if is_cpu() else "half" dtype = "bfloat16" if current_platform.is_cpu() else "half"
model_config = ModelConfig( model_config = ModelConfig(
model_name, model_name,
......
...@@ -5,8 +5,9 @@ import pytest ...@@ -5,8 +5,9 @@ import pytest
import torch import torch
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_cpu, make_tensor_with_pad from vllm.utils import make_tensor_with_pad
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.model_runner import _get_graph_batch_size
...@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args, ...@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
return model_runner return model_runner
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently " reason="CPU backend is currently "
"unsupported for encoder/ " "unsupported for encoder/ "
"decoder models") "decoder models")
...@@ -74,7 +75,7 @@ def test_empty_seq_group(): ...@@ -74,7 +75,7 @@ def test_empty_seq_group():
assert return_seq_lens is None assert return_seq_lens is None
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently " reason="CPU backend is currently "
"unsupported for encoder/ " "unsupported for encoder/ "
"decoder models") "decoder models")
...@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size): ...@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
assert torch.equal(actual, expected) assert torch.equal(actual, expected)
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently " reason="CPU backend is currently "
"unsupported for encoder/ " "unsupported for encoder/ "
"decoder models") "decoder models")
......
...@@ -10,9 +10,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, ...@@ -10,9 +10,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType) AttentionMetadata, AttentionType)
from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.paged_attn import PagedAttentionMetadata from vllm.attention.ops.paged_attn import PagedAttentionMetadata
from vllm.utils import is_cpu from vllm.platforms import current_platform
if is_cpu(): if current_platform.is_cpu():
try: try:
from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.ipex_attn import PagedAttention
except ImportError: except ImportError:
......
...@@ -3,7 +3,7 @@ import math ...@@ -3,7 +3,7 @@ import math
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip from vllm.utils import is_hip
from .utils import (dense_to_crow_col, get_head_sliding_step, from .utils import (dense_to_crow_col, get_head_sliding_step,
get_sparse_attn_mask) get_sparse_attn_mask)
...@@ -32,7 +32,7 @@ class LocalStridedBlockSparseAttn(torch.nn.Module): ...@@ -32,7 +32,7 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
): ):
super().__init__() super().__init__()
if use_spda is None: if use_spda is None:
use_spda = is_hip() or is_cpu() or not \ use_spda = is_hip() or current_platform.is_cpu() or not \
IS_COMPUTE_8_OR_ABOVE IS_COMPUTE_8_OR_ABOVE
device = device or (torch.cuda.current_device() device = device or (torch.cuda.current_device()
if current_platform.is_cuda_alike() else "cpu") if current_platform.is_cuda_alike() else "cpu")
......
...@@ -10,7 +10,7 @@ import vllm.envs as envs ...@@ -10,7 +10,7 @@ import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -121,7 +121,7 @@ def get_attn_backend( ...@@ -121,7 +121,7 @@ def get_attn_backend(
ROCmFlashAttentionBackend) ROCmFlashAttentionBackend)
return ROCmFlashAttentionBackend return ROCmFlashAttentionBackend
elif backend == _Backend.TORCH_SDPA: elif backend == _Backend.TORCH_SDPA:
assert is_cpu(), RuntimeError( assert current_platform.is_cpu(), RuntimeError(
"Torch SDPA backend is only used for the CPU device.") "Torch SDPA backend is only used for the CPU device.")
logger.info("Using Torch SDPA backend.") logger.info("Using Torch SDPA backend.")
from vllm.attention.backends.torch_sdpa import TorchSDPABackend from vllm.attention.backends.torch_sdpa import TorchSDPABackend
...@@ -183,7 +183,7 @@ def which_attn_to_use( ...@@ -183,7 +183,7 @@ def which_attn_to_use(
if backend_by_env_var is not None: if backend_by_env_var is not None:
selected_backend = backend_name_to_enum(backend_by_env_var) selected_backend = backend_name_to_enum(backend_by_env_var)
if is_cpu(): if current_platform.is_cpu():
if selected_backend != _Backend.TORCH_SDPA: if selected_backend != _Backend.TORCH_SDPA:
logger.info("Cannot use %s backend on CPU.", selected_backend) logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA return _Backend.TORCH_SDPA
......
...@@ -37,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup ...@@ -37,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_cpu, supports_custom_op from vllm.utils import supports_custom_op
@dataclass @dataclass
...@@ -1139,7 +1139,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): ...@@ -1139,7 +1139,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
import ray # Lazy import Ray import ray # Lazy import Ray
ray.shutdown() ray.shutdown()
gc.collect() gc.collect()
if not is_cpu(): if not current_platform.is_cpu():
torch.cuda.empty_cache() torch.cuda.empty_cache()
......
...@@ -7,7 +7,7 @@ import vllm.envs as envs ...@@ -7,7 +7,7 @@ import vllm.envs as envs
from vllm.compilation.levels import CompilationLevel from vllm.compilation.levels import CompilationLevel
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip, is_xpu, print_warning_once from vllm.utils import is_hip, is_xpu, print_warning_once
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -74,7 +74,7 @@ class CustomOp(nn.Module): ...@@ -74,7 +74,7 @@ class CustomOp(nn.Module):
if is_hip(): if is_hip():
return self.forward_hip return self.forward_hip
elif is_cpu(): elif current_platform.is_cpu():
return self.forward_cpu return self.forward_cpu
elif current_platform.is_tpu(): elif current_platform.is_tpu():
return self.forward_tpu return self.forward_tpu
......
...@@ -21,7 +21,7 @@ from vllm.model_executor.models import ModelRegistry ...@@ -21,7 +21,7 @@ from vllm.model_executor.models import ModelRegistry
from vllm.multimodal.base import NestedTensors from vllm.multimodal.base import NestedTensors
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import is_cpu, is_pin_memory_available from vllm.utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -515,7 +515,7 @@ def get_vit_attn_backend() -> _Backend: ...@@ -515,7 +515,7 @@ def get_vit_attn_backend() -> _Backend:
"so we use xformers backend instead. You can run " "so we use xformers backend instead. You can run "
"`pip install flash-attn` to use flash-attention backend.") "`pip install flash-attn` to use flash-attention backend.")
selected_backend = _Backend.XFORMERS selected_backend = _Backend.XFORMERS
elif is_cpu(): elif current_platform.is_cpu():
selected_backend = _Backend.TORCH_SDPA selected_backend = _Backend.TORCH_SDPA
else: else:
selected_backend = _Backend.XFORMERS selected_backend = _Backend.XFORMERS
......
...@@ -318,15 +318,6 @@ def is_hip() -> bool: ...@@ -318,15 +318,6 @@ def is_hip() -> bool:
return torch.version.hip is not None return torch.version.hip is not None
@lru_cache(maxsize=None)
def is_cpu() -> bool:
    """Report whether the installed ``vllm`` distribution looks like a CPU build.

    The check is a substring test: it looks for ``"cpu"`` anywhere in the
    version string reported by the installed ``vllm`` package metadata.
    The result is memoized, so the metadata is consulted at most once per
    process.

    Returns:
        True if the installed ``vllm`` version string contains ``"cpu"``;
        False if it does not, or if no ``vllm`` distribution metadata can
        be found.
    """
    # Imported lazily, inside the function, exactly as the original did.
    from importlib.metadata import PackageNotFoundError, version

    try:
        return "cpu" in version("vllm")
    except PackageNotFoundError:
        # No installed distribution named "vllm": treat as "not a CPU build".
        return False
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def is_openvino() -> bool: def is_openvino() -> bool:
from importlib.metadata import PackageNotFoundError, version from importlib.metadata import PackageNotFoundError, version
...@@ -798,7 +789,7 @@ def is_pin_memory_available() -> bool: ...@@ -798,7 +789,7 @@ def is_pin_memory_available() -> bool:
elif is_neuron(): elif is_neuron():
print_warning_once("Pin memory is not supported on Neuron.") print_warning_once("Pin memory is not supported on Neuron.")
return False return False
elif is_cpu() or is_openvino(): elif current_platform.is_cpu() or is_openvino():
return False return False
return True return True
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment