utils.py 3.05 KB
Newer Older
1
2
3
4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for Helion kernel management."""

5
from vllm.logger import init_logger
6
7
from vllm.platforms import current_platform

8
logger = init_logger(__name__)
9

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Maps known variant GPU names (after lowercase/underscore normalization)
# to their canonical form.
#
# Names that are already canonical after normalization are NOT listed here.
# For example, "NVIDIA H200" normalizes to "nvidia_h200" which needs no
# further mapping, and AMD ROCm names like "AMD_Instinct_MI300X" come from
# a controlled lookup table in rocm.py and normalize cleanly to
# "amd_instinct_mi300x". Only names with variant suffixes (form factor,
# memory size, memory type, etc.) that should be stripped need entries.
#
# To add a new GPU variant: run `canonicalize_gpu_name()` without the alias
# to see the normalized name, then add a mapping here if it contains variant
# suffixes that should be stripped (e.g. Blackwell/Rubin variants).
_GPU_NAME_ALIASES: dict[str, str] = {
    # H100 variants
    "nvidia_h100_pcie": "nvidia_h100",
    "nvidia_h100_sxm5": "nvidia_h100",
    "nvidia_h100_80gb_hbm3": "nvidia_h100",
    "nvidia_h100_nvl": "nvidia_h100",
    # H200 variants
    "nvidia_h200_nvl": "nvidia_h200",
    "nvidia_h200_141gb_hbm3e": "nvidia_h200",
    # A100 variants
    "nvidia_a100_sxm4_80gb": "nvidia_a100",
    "nvidia_a100_sxm4_40gb": "nvidia_a100",
    "nvidia_a100_pcie_80gb": "nvidia_a100",
    "nvidia_a100_pcie_40gb": "nvidia_a100",
    "nvidia_a100_80gb_pcie": "nvidia_a100",
    # V100 variants (Tesla-branded)
    "tesla_v100_sxm2_32gb": "tesla_v100",
    "tesla_v100_sxm2_16gb": "tesla_v100",
    "tesla_v100_pcie_32gb": "tesla_v100",
    "tesla_v100_pcie_16gb": "tesla_v100",
    # AMD ROCm variants (from _ROCM_DEVICE_ID_NAME_MAP in rocm.py)
    "amd_instinct_mi300x_hf": "amd_instinct_mi300x",
    # ADD MORE HERE
}

48
49
50

def get_gpu_name(device_id: int | None = None) -> str:
    """Return the device name reported by the current platform.

    When ``device_id`` is omitted, a warning is emitted through
    ``logger.warning_once`` and device 0 is queried instead.
    """
    # Common case first: an explicit device id needs no fallback.
    if device_id is not None:
        return current_platform.get_device_name(device_id)

    logger.warning_once(
        "get_gpu_name() called without device_id, defaulting to 0. "
        "This may return the wrong device name in multi-node setups."
    )
    return current_platform.get_device_name(0)
57
58
59
60
61
62


def canonicalize_gpu_name(name: str) -> str:
    """
    Canonicalize GPU name for use as a platform identifier.

    Strips surrounding whitespace, converts to lowercase, replaces spaces
    and hyphens with underscores, and maps known variant names to their
    canonical form via _GPU_NAME_ALIASES.
    e.g., "NVIDIA H100 80GB HBM3" -> "nvidia_h100"
          "NVIDIA A100-SXM4-80GB" -> "nvidia_a100"
          "AMD Instinct MI300X"   -> "amd_instinct_mi300x"

    Args:
        name: Raw GPU name to normalize.

    Returns:
        The normalized name, or the canonical form registered for it in
        _GPU_NAME_ALIASES when an entry exists.

    Raises:
        ValueError: If ``name`` is empty or consists only of whitespace.
    """
    # Strip once and reuse the result. Validating the stripped value but
    # normalizing the raw one would turn surrounding whitespace into
    # leading/trailing underscores and defeat the alias lookup.
    stripped = name.strip() if name else ""
    if not stripped:
        raise ValueError("GPU name cannot be empty")

    normalized = stripped.lower().replace(" ", "_").replace("-", "_")
    # Names without a registered alias are already canonical.
    return _GPU_NAME_ALIASES.get(normalized, normalized)


def get_canonical_gpu_name(device_id: int | None = None) -> str:
    """Return the canonicalized name of a device.

    Fetches the device name with ``get_gpu_name(device_id)`` and passes it
    through ``canonicalize_gpu_name``.
    """
    raw_name = get_gpu_name(device_id)
    return canonicalize_gpu_name(raw_name)