[Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)

Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Lu Fang <lufang@fb.com>
This commit is contained in:
Xiaodong Wang
2025-05-02 12:44:19 -07:00
committed by GitHub
parent 182f40ea8b
commit 9352cdb56d
3 changed files with 231 additions and 3 deletions

View File

@@ -58,6 +58,15 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
}
# Lookup table from an ASIC device-ID string (hex, lowercase, "0x"-prefixed)
# to the device name that should be reported for it.
# get_device_name() checks the `device_id` field of the amdsmi ASIC info
# against these keys and falls back to the `market_name` field when the ID
# is not listed here.
_ROCM_DEVICE_ID_NAME_MAP: Dict[str, str] = {
"0x74a0": "AMD_Instinct_MI300A",
"0x74a1": "AMD_Instinct_MI300X",
"0x74b5": "AMD_Instinct_MI300X", # MI300X VF
"0x74a5": "AMD_Instinct_MI325X",
"0x74b9": "AMD_Instinct_MI325X", # MI325X VF
"0x74a9": "AMD_Instinct_MI300X_HF",
"0x74bd": "AMD_Instinct_MI300X_HF",  # NOTE(review): presumably the MI300X_HF VF, by analogy with the entries above — TODO confirm
}
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
if "HIP_VISIBLE_DEVICES" in os.environ:
@@ -225,7 +234,11 @@ class RocmPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the name of the GPU behind logical device ``device_id``.

    The logical index is first translated to a physical index with
    ``device_id_to_physical_device_id`` and the matching amdsmi processor
    handle is queried for its ASIC info.

    If the ASIC's ``device_id`` is listed in ``_ROCM_DEVICE_ID_NAME_MAP``
    the mapped name is returned; otherwise the ``market_name`` reported by
    amdsmi is returned unchanged.

    Args:
        device_id: Logical device index (defaults to 0).

    Returns:
        The mapped device name, or amdsmi's ``market_name`` as a fallback.
    """
    physical_device_id = device_id_to_physical_device_id(device_id)
    handle = amdsmi_get_processor_handles()[physical_device_id]
    asic_info = amdsmi_get_gpu_asic_info(handle)
    # Look the device ID up first; an unconditional early return of
    # `market_name` here would make the table lookup unreachable.
    asic_device_id: str = asic_info["device_id"]
    if asic_device_id in _ROCM_DEVICE_ID_NAME_MAP:
        return _ROCM_DEVICE_ID_NAME_MAP[asic_device_id]
    return asic_info["market_name"]
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int: