Unverified Commit 9d7577b2 authored by Yanan Cao's avatar Yanan Cao Committed by GitHub
Browse files

[Kernel] [Helion] [9/N] Canonicalize GPU variant names to base model names (#34928)


Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent e739c29e
...@@ -11,11 +11,13 @@ from vllm.kernels.helion.utils import canonicalize_gpu_name ...@@ -11,11 +11,13 @@ from vllm.kernels.helion.utils import canonicalize_gpu_name
"driver_reported_name,expected", "driver_reported_name,expected",
[ [
("NVIDIA H200", "nvidia_h200"), ("NVIDIA H200", "nvidia_h200"),
("NVIDIA A100-SXM4-80GB", "nvidia_a100_sxm4_80gb"), ("NVIDIA A100-SXM4-80GB", "nvidia_a100"),
("NVIDIA H100 80GB HBM3", "nvidia_h100_80gb_hbm3"), ("NVIDIA H100 80GB HBM3", "nvidia_h100"),
("NVIDIA H100 PCIe", "nvidia_h100"),
("NVIDIA H100 SXM5", "nvidia_h100"),
("NVIDIA GeForce RTX 4090", "nvidia_geforce_rtx_4090"), ("NVIDIA GeForce RTX 4090", "nvidia_geforce_rtx_4090"),
("AMD Instinct MI300X", "amd_instinct_mi300x"), ("AMD Instinct MI300X", "amd_instinct_mi300x"),
("Tesla V100-SXM2-32GB", "tesla_v100_sxm2_32gb"), ("Tesla V100-SXM2-32GB", "tesla_v100"),
], ],
) )
def test_canonicalize_gpu_name(driver_reported_name, expected): def test_canonicalize_gpu_name(driver_reported_name, expected):
......
...@@ -71,10 +71,18 @@ class ConfigSet: ...@@ -71,10 +71,18 @@ class ConfigSet:
platform_dict = self._configs.get(platform) platform_dict = self._configs.get(platform)
if platform_dict is None: if platform_dict is None:
avail_platforms = self.get_platforms() avail_platforms = self.get_platforms()
# TODO(@gmagogsfm): add a CLI/env override flag so users can
# directly specify a platform name instead of relying on
# auto-detection, and suggest it in this error message.
raise KeyError( raise KeyError(
f"Config not found for kernel '{self._kernel_name}': " f"Config not found for kernel '{self._kernel_name}': "
f"platform '{platform}' not found. " f"platform '{platform}' not found. "
f"Available platforms: {avail_platforms or '(none)'}" f"Available platforms: {avail_platforms or '(none)'}. "
f"If your GPU is a variant of a supported platform, "
f"consider adding a mapping in _GPU_NAME_ALIASES in "
f"vllm/kernels/helion/utils.py, or run "
f"scripts/autotune_helion_kernels.py to generate configs "
f"for your platform."
) )
config = platform_dict.get(config_key) config = platform_dict.get(config_key)
......
...@@ -8,6 +8,44 @@ from vllm.platforms import current_platform ...@@ -8,6 +8,44 @@ from vllm.platforms import current_platform
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Maps known variant GPU names (after lowercase/underscore normalization)
# to their canonical form.
#
# Names that are already canonical after normalization are NOT listed here.
# For example, "NVIDIA H200" normalizes to "nvidia_h200" which needs no
# further mapping, and AMD ROCm names like "AMD_Instinct_MI300X" come from
# a controlled lookup table in rocm.py and normalize cleanly to
# "amd_instinct_mi300x". Only names with variant suffixes (form factor,
# memory size, memory type, etc.) that should be stripped need entries.
#
# To add a new GPU variant: call `canonicalize_gpu_name()` on the
# driver-reported name before adding any alias for it, to see its normalized
# form; then add a mapping here if that form contains variant suffixes that
# should be stripped (e.g. Blackwell/Rubin variants).
_GPU_NAME_ALIASES: dict[str, str] = {
    # Flattened from (canonical name, variant names) groups so that each
    # canonical name is spelled exactly once. Keys are the normalized
    # variant names; values are the canonical names they resolve to.
    variant: canonical
    for canonical, variants in (
        # H100 variants
        (
            "nvidia_h100",
            (
                "nvidia_h100_pcie",
                "nvidia_h100_sxm5",
                "nvidia_h100_80gb_hbm3",
                "nvidia_h100_nvl",
            ),
        ),
        # H200 variants
        (
            "nvidia_h200",
            (
                "nvidia_h200_nvl",
                "nvidia_h200_141gb_hbm3e",
            ),
        ),
        # A100 variants
        (
            "nvidia_a100",
            (
                "nvidia_a100_sxm4_80gb",
                "nvidia_a100_sxm4_40gb",
                "nvidia_a100_pcie_80gb",
                "nvidia_a100_pcie_40gb",
                "nvidia_a100_80gb_pcie",
            ),
        ),
        # V100 variants (Tesla-branded)
        (
            "tesla_v100",
            (
                "tesla_v100_sxm2_32gb",
                "tesla_v100_sxm2_16gb",
                "tesla_v100_pcie_32gb",
                "tesla_v100_pcie_16gb",
            ),
        ),
        # AMD ROCm variants (from _ROCM_DEVICE_ID_NAME_MAP in rocm.py)
        (
            "amd_instinct_mi300x",
            ("amd_instinct_mi300x_hf",),
        ),
        # ADD MORE HERE
    )
    for variant in variants
}
def get_gpu_name(device_id: int | None = None) -> str: def get_gpu_name(device_id: int | None = None) -> str:
if device_id is None: if device_id is None:
...@@ -23,17 +61,19 @@ def canonicalize_gpu_name(name: str) -> str: ...@@ -23,17 +61,19 @@ def canonicalize_gpu_name(name: str) -> str:
""" """
Canonicalize GPU name for use as a platform identifier. Canonicalize GPU name for use as a platform identifier.
Converts to lowercase and replaces spaces and hyphens with underscores. Converts to lowercase, replaces spaces and hyphens with underscores,
e.g., "NVIDIA A100-SXM4-80GB" -> "nvidia_a100_sxm4_80gb" and maps known variant names to their canonical form via _GPU_NAME_ALIASES.
"AMD_Instinct_MI300X" -> "amd_instinct_mi300x" e.g., "NVIDIA H100 80GB HBM3" -> "nvidia_h100"
"NVIDIA A100-SXM4-80GB" -> "nvidia_a100"
Raises ValueError if name is empty. "AMD Instinct MI300X" -> "amd_instinct_mi300x"
""" """
if not name or not name.strip(): if not name or not name.strip():
raise ValueError("GPU name cannot be empty") raise ValueError("GPU name cannot be empty")
name = name.lower() name = name.lower()
name = name.replace(" ", "_") name = name.replace(" ", "_")
name = name.replace("-", "_") name = name.replace("-", "_")
if name in _GPU_NAME_ALIASES:
return _GPU_NAME_ALIASES[name]
return name return name
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment