import enum
import os
import socket
import uuid
from platform import uname
from typing import List

import psutil
import torch


class Device(enum.Enum):
    GPU = enum.auto()
    CPU = enum.auto()


class Counter:

    def __init__(self, start: int = 0) -> None:
        self.counter = start

    def __next__(self) -> int:
        i = self.counter
        self.counter += 1
        return i

    def reset(self) -> None:
        self.counter = 0


def is_hip() -> bool:
    return torch.version.hip is not None


def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
    # NOTE: This import statement should be executed lazily since
    # the Neuron-X backend does not have the `cuda_utils` module.
    from vllm._C import cuda_utils

    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
    max_shared_mem = cuda_utils.get_device_attribute(
        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
    return int(max_shared_mem)


def get_cpu_memory() -> int:
    """Returns the total CPU memory of the node in bytes."""
    return psutil.virtual_memory().total


def random_uuid() -> str:
    return str(uuid.uuid4().hex)


def in_wsl() -> bool:
    # Reference: https://github.com/microsoft/WSL/issues/4071
    return "microsoft" in " ".join(uname()).lower()


def get_ip() -> str:
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
    return s.getsockname()[0]


def get_distributed_init_method(ip: str, port: int) -> str:
    return f"tcp://{ip}:{port}"


def get_open_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def set_cuda_visible_devices(device_ids: List[int]) -> None:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids))
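

# A minimal, hypothetical usage sketch (not part of the original module): it
# exercises only the pure-Python helpers defined above. The GPU-dependent
# helper `get_max_shared_memory_bytes` is skipped because it requires the
# compiled `vllm._C` extension and a CUDA/ROCm device.
if __name__ == "__main__":
    # Counter hands out monotonically increasing integer IDs.
    request_counter = Counter()
    print("first ids:", next(request_counter), next(request_counter))

    # Build a torch.distributed-style init method from the local IP and a
    # free TCP port (note: the port is released again once chosen, so another
    # process could in principle grab it before it is used).
    ip = get_ip()
    port = get_open_port()
    print("distributed init method:", get_distributed_init_method(ip, port))

    print("request id:", random_uuid())
    print("total CPU memory (GiB):", get_cpu_memory() / (1024 ** 3))
    print("running under WSL:", in_wsl())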