diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 65701b78b..4c15e7382 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -156,8 +156,9 @@ steps:
 
 - label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -173,8 +174,9 @@ steps:
 
 - label: Entrypoints Integration Test (API Server 2)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -192,8 +194,9 @@ steps:
 
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -207,8 +210,9 @@ steps:
 
 - label: Entrypoints Integration Test (Responses API)
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -222,8 +226,9 @@ steps:
 
 - label: Distributed Tests (4 GPUs) # 35min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -285,8 +290,9 @@ steps:
 
 - label: Distributed Tests (8 GPUs) # 4min
   timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
+  optional: true
   # grade: Blocking
   gpu: h100
   num_gpus: 8
@@ -381,10 +387,11 @@ steps:
 
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
   # See discussion here: https://github.com/vllm-project/vllm/pull/31040
   agent_pool: mi325_8
+  optional: true
   # grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -408,8 +415,9 @@ steps:
 
 - label: V1 Test others # 42min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -436,8 +444,9 @@ steps:
 # TODO: Add the "V1 Test attetion (MI300)" test group
 
 - label: V1 Test attention (H100) # 10min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   timeout_in_minutes: 30
   gpu: h100
@@ -541,8 +550,9 @@ steps:
 
 - label: Samplers Test # 56min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - vllm/model_executor/layers
@@ -554,8 +564,9 @@ steps:
 
 - label: LoRA Test %N # 20min each
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - vllm/lora
@@ -665,8 +676,9 @@ steps:
 
 - label: Kernels Quantization Test %N # 64min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - csrc/quantization/
@@ -799,8 +811,9 @@ steps:
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   source_file_dependencies:
   - csrc/
@@ -861,8 +874,9 @@ steps:
 
 - label: Basic Models Tests (Other)
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -903,8 +917,9 @@ steps:
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -924,8 +939,9 @@ steps:
 
 - label: Language Models Tests (Hybrid) %N
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
+  optional: true
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -945,7 +961,7 @@ steps:
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -961,7 +977,7 @@ steps:
 
 - label: Language Models Test (PPL)
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -973,7 +989,7 @@ steps:
 
 - label: Language Models Test (Extended Pooling)  # 36min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -985,7 +1001,7 @@ steps:
 
 - label: Language Models Test (MTEB)
   timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -997,7 +1013,7 @@ steps:
 
 - label: Multi-Modal Processor Test (CPU)
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   source_file_dependencies:
   - vllm/
@@ -1009,7 +1025,7 @@ steps:
 
 - label: Multi-Modal Processor Test # 44min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -1021,7 +1037,7 @@ steps:
 
 - label: Multi-Modal Models Test (Standard) # 60min
   timeout_in_minutes: 100
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -1054,7 +1070,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1069,7 +1085,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 2 #60min
   timeout_in_minutes: 120
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1084,7 +1100,7 @@ steps:
 
 - label: Multi-Modal Models Test (Extended) 3 # 75min
   timeout_in_minutes: 150
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   optional: true
@@ -1109,7 +1125,7 @@ steps:
     - pytest -v -s models/quantization
 
 - label: Transformers Nightly Models Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/"
@@ -1264,8 +1280,9 @@ steps:
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdmultinode]
+  mirror_hardwares: [amdexperimental, amdproduction, amdmultinode]
   agent_pool: mi325_4
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1291,8 +1308,9 @@ steps:
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1331,8 +1349,9 @@ steps:
 
 - label: Distributed Model Tests (2 GPUs) # 37min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
+  optional: true
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -1442,7 +1461,7 @@ steps:
     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
 
 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1486,7 +1505,7 @@ steps:
 ##### A100 test #####
 
 - label: Distributed Tests (A100) # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   gpu: a100
@@ -1509,7 +1528,7 @@ steps:
 - label: LM Eval Large Models # optional
   gpu: a100
   optional: true
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   num_gpus: 4
@@ -1525,7 +1544,7 @@ steps:
 - label: LM Eval Large Models (H100) # optional
   gpu: h100
   optional: true
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   num_gpus: 4
@@ -1540,7 +1559,7 @@ steps:
 
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   gpu: h200
@@ -1600,8 +1619,9 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 - label: ROCm LM Eval Large Models (8 Card)
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
+  optional: true
   num_gpus: 8
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   commands:
@@ -1660,7 +1680,7 @@ steps:
 
 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
   optional: true
diff --git a/tests/utils.py b/tests/utils.py
index d407733a3..03e5ccadb 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -65,6 +65,8 @@ from vllm.utils.torch_utils import (
 FP8_DTYPE = current_platform.fp8_dtype()
 
 if current_platform.is_rocm():
+    import threading
+
     from amdsmi import (
         amdsmi_get_gpu_vram_usage,
         amdsmi_get_processor_handles,
@@ -72,13 +74,16 @@ if current_platform.is_rocm():
         amdsmi_shut_down,
     )
 
+    _amdsmi_lock = threading.Lock()
+
     @contextmanager
     def _nvml():
-        try:
-            amdsmi_init()
-            yield
-        finally:
-            amdsmi_shut_down()
+        with _amdsmi_lock:  # plain Lock: nesting _nvml() in one thread deadlocks
+            amdsmi_init()  # outside try: a failed init must not trigger shut_down
+            try:
+                yield
+            finally:
+                amdsmi_shut_down()
 elif current_platform.is_cuda():
     from vllm.third_party.pynvml import (
         nvmlDeviceGetHandleByIndex,