[ROCm][CI] Expose tests to AMD production CI and fix amdsmi heap corruption (#35071)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-02-27 23:57:31 -06:00
committed by GitHub
parent 94029ffaf0
commit f5d1281c9d
2 changed files with 67 additions and 42 deletions

View File

@@ -65,6 +65,8 @@ from vllm.utils.torch_utils import (
 FP8_DTYPE = current_platform.fp8_dtype()
 
 if current_platform.is_rocm():
+    import threading
+
     from amdsmi import (
         amdsmi_get_gpu_vram_usage,
         amdsmi_get_processor_handles,
@@ -72,13 +74,16 @@ if current_platform.is_rocm():
         amdsmi_shut_down,
     )
 
+    _amdsmi_lock = threading.Lock()
+
     @contextmanager
     def _nvml():
-        try:
-            amdsmi_init()
-            yield
-        finally:
-            amdsmi_shut_down()
+        with _amdsmi_lock:
+            try:
+                amdsmi_init()
+                yield
+            finally:
+                amdsmi_shut_down()
 elif current_platform.is_cuda():
     from vllm.third_party.pynvml import (
         nvmlDeviceGetHandleByIndex,