[TPU] Suppress import custom_ops warning (#7458)

d6e634f3 · Woosuk Kwon · GitHub · 4d2dc507 · d6e634f3 · d6e634f3
Unverified commit d6e634f3, authored Aug 13, 2024 by Woosuk Kwon; committed by GitHub on Aug 13, 2024.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 5 deletions

vllm/_custom_ops.py: +6 -4

vllm/utils.py: +1 -1

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -6,12 +6,14 @@ import torch
 from vllm._core_ext import ScalarType
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 logger = init_logger(__name__)
-try:
+if not current_platform.is_tpu():
+    try:
        import vllm._C
-except ImportError as e:
+    except ImportError as e:
        logger.warning("Failed to import from vllm._C with %r", e)
 with contextlib.suppress(ImportError):

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -29,7 +29,6 @@ import torch.types
 from typing_extensions import ParamSpec, TypeIs, assert_never
 import vllm.envs as envs
-from vllm import _custom_ops as ops
 from vllm.logger import enable_trace_function_call, init_logger
 logger = init_logger(__name__)
@@ -359,6 +358,7 @@ def is_xpu() -> bool:
 @lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
+    from vllm import _custom_ops as ops
    max_shared_mem = (
        ops.get_max_shared_memory_per_block_device_attribute(gpu))
    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py