[Hardware][CPU] using current_platform.is_cpu (#9536)

3ddbe255 · wangshuai09 · GitHub · 0d02747f · 3ddbe255 · 3ddbe255
Unverified commit 3ddbe255, authored by wangshuai09 on Oct 22, 2024; committed by GitHub on Oct 22, 2024.
17 changed files
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -32,9 +32,10 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_cpu)
+                        identity)

 logger = init_logger(__name__)

@@ -236,7 +237,8 @@ class HfRunner:

    def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
        if device is None:
-            return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
+            return self.wrap_device(
+                input, "cpu" if current_platform.is_cpu() else "cuda")

        if hasattr(input, "device") and input.device.type == device:
            return input

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -7,8 +7,8 @@ from typing import List, Optional, Tuple
 import pytest
 from transformers import AutoModelForSeq2SeqLM

+from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu

 from ..conftest import DecoderPromptType
 from ..models.utils import check_logprobs_close
@@ -35,7 +35,7 @@ def vllm_to_hf_output(
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
 @pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.skipif(
-    is_cpu(),
+    current_platform.is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
 )
 def test_encoder_decoder_e2e(

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -19,7 +19,8 @@ def test_env(name: str, device: str, monkeypatch):
    override_backend_env_variable(monkeypatch, name)

    if device == "cpu":
-        with patch("vllm.attention.selector.is_cpu", return_value=True):
+        with patch("vllm.attention.selector.current_platform.is_cpu",
+                   return_value=True):
            backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
                                        False)
        assert backend.name == "TORCH_SDPA"

--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
 import pytest
 import torch

-from vllm.utils import is_cpu
+from vllm.platforms import current_platform

 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="This test takes a lot time to run on CPU, "
                    "and vllm CI's disk space is not enough for this model.")
 @large_gpu_test(min_gb=80)

--- a/tests/models/decoder_only/vision_language/test_fuyu.py
+++ b/tests/models/decoder_only/vision_language/test_fuyu.py
@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
 import pytest

 from vllm.multimodal.utils import rescale_image_size
+from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu

 from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
@@ -103,7 +103,7 @@ def run_test(


 target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"



--- a/tests/models/decoder_only/vision_language/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -7,7 +7,7 @@ from PIL.Image import Image
 from transformers import AutoConfig

 from vllm.multimodal.utils import rescale_image_size
-from vllm.utils import is_cpu
+from vllm.platforms import current_platform

 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
@@ -244,7 +244,7 @@ def run_awq_test(


 target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"



--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
 from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
 from vllm.multimodal import MultiModalRegistry
 from vllm.multimodal.utils import rescale_image_size
+from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_hip

 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


 target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"

 # ROCm Triton FA can run into shared memory issues with these models,

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -5,8 +5,8 @@ import torch

 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
+from vllm.platforms import current_platform
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
-from vllm.utils import is_cpu

 TokensText = Tuple[List[int], str]

@@ -270,7 +270,7 @@ def build_model_context(model_name: str,
    if tokenizer_name is None:
        tokenizer_name = model_name
    if dtype is None:
-        dtype = "bfloat16" if is_cpu() else "half"
+        dtype = "bfloat16" if current_platform.is_cpu() else "half"

    model_config = ModelConfig(
        model_name,

--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -5,8 +5,9 @@ import pytest
 import torch

 from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import is_cpu, make_tensor_with_pad
+from vllm.utils import make_tensor_with_pad
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import _get_graph_batch_size

@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
    return model_runner


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@@ -74,7 +75,7 @@ def test_empty_seq_group():
    assert return_seq_lens is None


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
    assert torch.equal(actual, expected)


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")

--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -10,9 +10,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
-from vllm.utils import is_cpu
+from vllm.platforms import current_platform

-if is_cpu():
+if current_platform.is_cpu():
    try:
        from vllm.attention.ops.ipex_attn import PagedAttention
    except ImportError:

--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -3,7 +3,7 @@ import math
 import torch

 from vllm.platforms import current_platform
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_hip

 from .utils import (dense_to_crow_col, get_head_sliding_step,
                    get_sparse_attn_mask)
@@ -32,7 +32,7 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
    ):
        super().__init__()
        if use_spda is None:
-            use_spda = is_hip() or is_cpu() or not \
+            use_spda = is_hip() or current_platform.is_cpu() or not \
                       IS_COMPUTE_8_OR_ABOVE
        device = device or (torch.cuda.current_device()
                            if current_platform.is_cuda_alike() else "cpu")

--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -10,7 +10,7 @@ import vllm.envs as envs
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu
+from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu

 logger = init_logger(__name__)

@@ -121,7 +121,7 @@ def get_attn_backend(
            ROCmFlashAttentionBackend)
        return ROCmFlashAttentionBackend
    elif backend == _Backend.TORCH_SDPA:
-        assert is_cpu(), RuntimeError(
+        assert current_platform.is_cpu(), RuntimeError(
            "Torch SDPA backend is only used for the CPU device.")
        logger.info("Using Torch SDPA backend.")
        from vllm.attention.backends.torch_sdpa import TorchSDPABackend
@@ -183,7 +183,7 @@ def which_attn_to_use(
        if backend_by_env_var is not None:
            selected_backend = backend_name_to_enum(backend_by_env_var)

-    if is_cpu():
+    if current_platform.is_cpu():
        if selected_backend != _Backend.TORCH_SDPA:
            logger.info("Cannot use %s backend on CPU.", selected_backend)
        return _Backend.TORCH_SDPA

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -37,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import is_cpu, supports_custom_op
+from vllm.utils import supports_custom_op


 @dataclass
@@ -1139,7 +1139,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
        import ray  # Lazy import Ray
        ray.shutdown()
    gc.collect()
-    if not is_cpu():
+    if not current_platform.is_cpu():
        torch.cuda.empty_cache()



--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -7,7 +7,7 @@ import vllm.envs as envs
 from vllm.compilation.levels import CompilationLevel
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import is_cpu, is_hip, is_xpu, print_warning_once
+from vllm.utils import is_hip, is_xpu, print_warning_once

 logger = init_logger(__name__)

@@ -74,7 +74,7 @@ class CustomOp(nn.Module):

        if is_hip():
            return self.forward_hip
-        elif is_cpu():
+        elif current_platform.is_cpu():
            return self.forward_cpu
        elif current_platform.is_tpu():
            return self.forward_tpu

--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -21,7 +21,7 @@ from vllm.model_executor.models import ModelRegistry
 from vllm.multimodal.base import NestedTensors
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_cpu, is_pin_memory_available
+from vllm.utils import is_pin_memory_available

 logger = init_logger(__name__)

@@ -515,7 +515,7 @@ def get_vit_attn_backend() -> _Backend:
                    "so we use xformers backend instead. You can run "
                    "`pip install flash-attn` to use flash-attention backend.")
                selected_backend = _Backend.XFORMERS
-        elif is_cpu():
+        elif current_platform.is_cpu():
            selected_backend = _Backend.TORCH_SDPA
        else:
            selected_backend = _Backend.XFORMERS

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -318,15 +318,6 @@ def is_hip() -> bool:
    return torch.version.hip is not None


-@lru_cache(maxsize=None)
-def is_cpu() -> bool:
-    from importlib.metadata import PackageNotFoundError, version
-    try:
-        return "cpu" in version("vllm")
-    except PackageNotFoundError:
-        return False
-
-
 @lru_cache(maxsize=None)
 def is_openvino() -> bool:
    from importlib.metadata import PackageNotFoundError, version
@@ -798,7 +789,7 @@ def is_pin_memory_available() -> bool:
    elif is_neuron():
        print_warning_once("Pin memory is not supported on Neuron.")
        return False
-    elif is_cpu() or is_openvino():
+    elif current_platform.is_cpu() or is_openvino():
        return False
    return True