Unverified Commit 3ddbe255 authored by wangshuai09's avatar wangshuai09 Committed by GitHub
Browse files

[Hardware][CPU] using current_platform.is_cpu (#9536)

parent 0d02747f
......@@ -32,9 +32,10 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu)
identity)
logger = init_logger(__name__)
......@@ -236,7 +237,8 @@ class HfRunner:
def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
if device is None:
return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
return self.wrap_device(
input, "cpu" if current_platform.is_cpu() else "cuda")
if hasattr(input, "device") and input.device.type == device:
return input
......
......@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
......@@ -35,7 +35,7 @@ def vllm_to_hf_output(
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
is_cpu(),
current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(
......
......@@ -19,7 +19,8 @@ def test_env(name: str, device: str, monkeypatch):
override_backend_env_variable(monkeypatch, name)
if device == "cpu":
with patch("vllm.attention.selector.is_cpu", return_value=True):
with patch("vllm.attention.selector.current_platform.is_cpu",
return_value=True):
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
False)
assert backend.name == "TORCH_SDPA"
......
......@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
import pytest
import torch
from vllm.utils import is_cpu
from vllm.platforms import current_platform
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
......@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@large_gpu_test(min_gb=80)
......
......@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
import pytest
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
......@@ -103,7 +103,7 @@ def run_test(
target_dtype = "half"
if is_cpu():
if current_platform.is_cpu():
target_dtype = "bfloat16"
......
......@@ -7,7 +7,7 @@ from PIL.Image import Image
from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
......@@ -244,7 +244,7 @@ def run_awq_test(
target_dtype = "half"
if is_cpu():
if current_platform.is_cpu():
target_dtype = "bfloat16"
......
......@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip
from vllm.utils import is_hip
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
......@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype = "half"
if is_cpu():
if current_platform.is_cpu():
target_dtype = "bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
......
......@@ -5,8 +5,8 @@ import torch
from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from vllm.utils import is_cpu
TokensText = Tuple[List[int], str]
......@@ -270,7 +270,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if is_cpu() else "half"
dtype = "bfloat16" if current_platform.is_cpu() else "half"
model_config = ModelConfig(
model_name,
......
......@@ -5,8 +5,9 @@ import pytest
import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_cpu, make_tensor_with_pad
from vllm.utils import make_tensor_with_pad
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import _get_graph_batch_size
......@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
return model_runner
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......@@ -74,7 +75,7 @@ def test_empty_seq_group():
assert return_seq_lens is None
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
assert torch.equal(actual, expected)
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="CPU backend is currently "
"unsupported for encoder/ "
"decoder models")
......
......@@ -10,9 +10,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.paged_attn import PagedAttentionMetadata
from vllm.utils import is_cpu
from vllm.platforms import current_platform
if is_cpu():
if current_platform.is_cpu():
try:
from vllm.attention.ops.ipex_attn import PagedAttention
except ImportError:
......
......@@ -3,7 +3,7 @@ import math
import torch
from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip
from vllm.utils import is_hip
from .utils import (dense_to_crow_col, get_head_sliding_step,
get_sparse_attn_mask)
......@@ -32,7 +32,7 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
):
super().__init__()
if use_spda is None:
use_spda = is_hip() or is_cpu() or not \
use_spda = is_hip() or current_platform.is_cpu() or not \
IS_COMPUTE_8_OR_ABOVE
device = device or (torch.cuda.current_device()
if current_platform.is_cuda_alike() else "cpu")
......
......@@ -10,7 +10,7 @@ import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu
from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu
logger = init_logger(__name__)
......@@ -121,7 +121,7 @@ def get_attn_backend(
ROCmFlashAttentionBackend)
return ROCmFlashAttentionBackend
elif backend == _Backend.TORCH_SDPA:
assert is_cpu(), RuntimeError(
assert current_platform.is_cpu(), RuntimeError(
"Torch SDPA backend is only used for the CPU device.")
logger.info("Using Torch SDPA backend.")
from vllm.attention.backends.torch_sdpa import TorchSDPABackend
......@@ -183,7 +183,7 @@ def which_attn_to_use(
if backend_by_env_var is not None:
selected_backend = backend_name_to_enum(backend_by_env_var)
if is_cpu():
if current_platform.is_cpu():
if selected_backend != _Backend.TORCH_SDPA:
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA
......
......@@ -37,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import is_cpu, supports_custom_op
from vllm.utils import supports_custom_op
@dataclass
......@@ -1139,7 +1139,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
import ray # Lazy import Ray
ray.shutdown()
gc.collect()
if not is_cpu():
if not current_platform.is_cpu():
torch.cuda.empty_cache()
......
......@@ -7,7 +7,7 @@ import vllm.envs as envs
from vllm.compilation.levels import CompilationLevel
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import is_cpu, is_hip, is_xpu, print_warning_once
from vllm.utils import is_hip, is_xpu, print_warning_once
logger = init_logger(__name__)
......@@ -74,7 +74,7 @@ class CustomOp(nn.Module):
if is_hip():
return self.forward_hip
elif is_cpu():
elif current_platform.is_cpu():
return self.forward_cpu
elif current_platform.is_tpu():
return self.forward_tpu
......
......@@ -21,7 +21,7 @@ from vllm.model_executor.models import ModelRegistry
from vllm.multimodal.base import NestedTensors
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils import is_cpu, is_pin_memory_available
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
......@@ -515,7 +515,7 @@ def get_vit_attn_backend() -> _Backend:
"so we use xformers backend instead. You can run "
"`pip install flash-attn` to use flash-attention backend.")
selected_backend = _Backend.XFORMERS
elif is_cpu():
elif current_platform.is_cpu():
selected_backend = _Backend.TORCH_SDPA
else:
selected_backend = _Backend.XFORMERS
......
......@@ -318,15 +318,6 @@ def is_hip() -> bool:
return torch.version.hip is not None
@lru_cache(maxsize=None)
def is_cpu() -> bool:
from importlib.metadata import PackageNotFoundError, version
try:
return "cpu" in version("vllm")
except PackageNotFoundError:
return False
@lru_cache(maxsize=None)
def is_openvino() -> bool:
from importlib.metadata import PackageNotFoundError, version
......@@ -798,7 +789,7 @@ def is_pin_memory_available() -> bool:
elif is_neuron():
print_warning_once("Pin memory is not supported on Neuron.")
return False
elif is_cpu() or is_openvino():
elif current_platform.is_cpu() or is_openvino():
return False
return True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment