[ROCm][CI] Expose tests to AMD production CI and fix amdsmi heap corruption (#35071)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-02-27 23:57:31 -06:00
committed by GitHub
parent 94029ffaf0
commit f5d1281c9d
2 changed files with 67 additions and 42 deletions

View File

@@ -156,8 +156,9 @@ steps:
- label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -173,8 +174,9 @@ steps:
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -192,8 +194,9 @@ steps:
- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -207,8 +210,9 @@ steps:
- label: Entrypoints Integration Test (Responses API)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -222,8 +226,9 @@ steps:
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -285,8 +290,9 @@ steps:
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
optional: true
# grade: Blocking
gpu: h100
num_gpus: 8
@@ -381,10 +387,11 @@ steps:
- label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
optional: true
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -408,8 +415,9 @@ steps:
- label: V1 Test others # 42min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -436,8 +444,9 @@ steps:
# TODO: Add the "V1 Test attention (MI300)" test group
- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
timeout_in_minutes: 30
gpu: h100
@@ -541,8 +550,9 @@ steps:
- label: Samplers Test # 56min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- vllm/model_executor/layers
@@ -554,8 +564,9 @@ steps:
- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- vllm/lora
@@ -665,8 +676,9 @@ steps:
- label: Kernels Quantization Test %N # 64min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- csrc/quantization/
@@ -799,8 +811,9 @@ steps:
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- csrc/
@@ -861,8 +874,9 @@ steps:
- label: Basic Models Tests (Other)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -903,8 +917,9 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -924,8 +939,9 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
optional: true
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -945,7 +961,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -961,7 +977,7 @@ steps:
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -973,7 +989,7 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -985,7 +1001,7 @@ steps:
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -997,7 +1013,7 @@ steps:
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
@@ -1009,7 +1025,7 @@ steps:
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
@@ -1021,7 +1037,7 @@ steps:
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 100
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -1054,7 +1070,7 @@ steps:
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -1069,7 +1085,7 @@ steps:
- label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -1084,7 +1100,7 @@ steps:
- label: Multi-Modal Models Test (Extended) 3 # 75min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
optional: true
@@ -1109,7 +1125,7 @@ steps:
- pytest -v -s models/quantization
- label: Transformers Nightly Models Test
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/"
@@ -1264,8 +1280,9 @@ steps:
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdmultinode]
mirror_hardwares: [amdexperimental, amdproduction, amdmultinode]
agent_pool: mi325_4
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -1291,8 +1308,9 @@ steps:
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -1331,8 +1349,9 @@ steps:
- label: Distributed Model Tests (2 GPUs) # 37min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
optional: true
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -1442,7 +1461,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1486,7 +1505,7 @@ steps:
##### A100 test #####
- label: Distributed Tests (A100) # optional
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
@@ -1509,7 +1528,7 @@ steps:
- label: LM Eval Large Models # optional
gpu: a100
optional: true
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
@@ -1525,7 +1544,7 @@ steps:
- label: LM Eval Large Models (H100) # optional
gpu: h100
optional: true
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
@@ -1540,7 +1559,7 @@ steps:
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
gpu: h200
@@ -1600,8 +1619,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
optional: true
num_gpus: 8
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
@@ -1660,7 +1680,7 @@ steps:
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
optional: true

View File

@@ -65,6 +65,8 @@ from vllm.utils.torch_utils import (
FP8_DTYPE = current_platform.fp8_dtype()
if current_platform.is_rocm():
import threading
from amdsmi import (
amdsmi_get_gpu_vram_usage,
amdsmi_get_processor_handles,
@@ -72,13 +74,16 @@ if current_platform.is_rocm():
amdsmi_shut_down,
)
_amdsmi_lock = threading.Lock()
@contextmanager
def _nvml():
try:
amdsmi_init()
yield
finally:
amdsmi_shut_down()
with _amdsmi_lock:
try:
amdsmi_init()
yield
finally:
amdsmi_shut_down()
elif current_platform.is_cuda():
from vllm.third_party.pynvml import (
nvmlDeviceGetHandleByIndex,