[Kernel] [Helion] [9/N] Canonicalize GPU variant names to base model names (#34928)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Yanan Cao
2026-02-20 19:55:51 -08:00
committed by GitHub
parent e739c29ea4
commit 9d7577b2bd
4 changed files with 59 additions and 27733 deletions

View File

@@ -71,10 +71,18 @@ class ConfigSet:
platform_dict = self._configs.get(platform)
if platform_dict is None:
avail_platforms = self.get_platforms()
# TODO(@gmagogsfm): add a CLI/env override flag so users can
# directly specify a platform name instead of relying on
# auto-detection, and suggest it in this error message.
raise KeyError(
f"Config not found for kernel '{self._kernel_name}': "
f"platform '{platform}' not found. "
f"Available platforms: {avail_platforms or '(none)'}"
f"Available platforms: {avail_platforms or '(none)'}. "
f"If your GPU is a variant of a supported platform, "
f"consider adding a mapping in _GPU_NAME_ALIASES in "
f"vllm/kernels/helion/utils.py, or run "
f"scripts/autotune_helion_kernels.py to generate configs "
f"for your platform."
)
config = platform_dict.get(config_key)

File diff suppressed because it is too large Load Diff

View File

@@ -8,6 +8,44 @@ from vllm.platforms import current_platform
logger = logging.getLogger(__name__)
# Maps known variant GPU names (after lowercase/underscore normalization)
# to their canonical form.
#
# Names that are already canonical after normalization are NOT listed here.
# For example, "NVIDIA H200" normalizes to "nvidia_h200" which needs no
# further mapping, and AMD ROCm names like "AMD_Instinct_MI300X" come from
# a controlled lookup table in rocm.py and normalize cleanly to
# "amd_instinct_mi300x". Only names with variant suffixes (form factor,
# memory size, memory type, etc.) that should be stripped need entries.
#
# To add a new GPU variant (e.g. a Blackwell/Rubin part): call
# `canonicalize_gpu_name()` on the raw device name before adding an alias to
# see its normalized form, then add a mapping here if that form contains
# variant suffixes that should be stripped.
_GPU_NAME_ALIASES: dict[str, str] = {
    # --- NVIDIA H100: form-factor and memory suffixes collapse to the base ---
    "nvidia_h100_pcie": "nvidia_h100",
    "nvidia_h100_sxm5": "nvidia_h100",
    "nvidia_h100_80gb_hbm3": "nvidia_h100",
    "nvidia_h100_nvl": "nvidia_h100",
    # --- NVIDIA H200 ---
    "nvidia_h200_nvl": "nvidia_h200",
    "nvidia_h200_141gb_hbm3e": "nvidia_h200",
    # --- NVIDIA A100: both "<form>_<mem>" and "<mem>_<form>" spellings ---
    "nvidia_a100_sxm4_80gb": "nvidia_a100",
    "nvidia_a100_sxm4_40gb": "nvidia_a100",
    "nvidia_a100_pcie_80gb": "nvidia_a100",
    "nvidia_a100_pcie_40gb": "nvidia_a100",
    "nvidia_a100_80gb_pcie": "nvidia_a100",
    # --- V100: reported under the "Tesla" brand, not "NVIDIA" ---
    "tesla_v100_sxm2_32gb": "tesla_v100",
    "tesla_v100_sxm2_16gb": "tesla_v100",
    "tesla_v100_pcie_32gb": "tesla_v100",
    "tesla_v100_pcie_16gb": "tesla_v100",
    # --- AMD ROCm: names originate from _ROCM_DEVICE_ID_NAME_MAP in rocm.py ---
    "amd_instinct_mi300x_hf": "amd_instinct_mi300x",
    # Append new variant -> base-model mappings above this line.
}
def get_gpu_name(device_id: int | None = None) -> str:
if device_id is None:
def canonicalize_gpu_name(name: str) -> str:
    """
    Canonicalize GPU name for use as a platform identifier.

    Strips surrounding whitespace, converts to lowercase, replaces spaces and
    hyphens with underscores, and maps known variant names to their canonical
    form via _GPU_NAME_ALIASES. Names without an alias entry are returned in
    their normalized form.

    e.g., "NVIDIA H100 80GB HBM3" -> "nvidia_h100"
          "NVIDIA A100-SXM4-80GB" -> "nvidia_a100"
          "AMD Instinct MI300X" -> "amd_instinct_mi300x"

    Raises ValueError if name is empty or contains only whitespace.
    """
    # Strip once and reuse the result: the emptiness check already ignores
    # surrounding whitespace, so the normalized name must ignore it too.
    # Otherwise " NVIDIA H100 " would become "_nvidia_h100_" and miss its alias.
    stripped = name.strip() if name else ""
    if not stripped:
        raise ValueError("GPU name cannot be empty")

    normalized = stripped.lower().replace(" ", "_").replace("-", "_")

    # Single lookup: fall back to the normalized name when no alias exists.
    return _GPU_NAME_ALIASES.get(normalized, normalized)