diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ab8bf9d23..c5db1ca83 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -388,9 +388,7 @@ steps:
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental, amdproduction]
-  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
-  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
-  agent_pool: mi325_8
+  agent_pool: mi325_1
   optional: true
   # grade: Blocking
   source_file_dependencies:
@@ -402,6 +400,34 @@ steps:
     - pytest -v -s v1/e2e
     - pytest -v -s v1/engine
 
+- label: V1 Test e2e (2 GPUs) # 65min matches the e2e + engine step; TODO confirm for this subset
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  optional: true
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # Only run tests that need exactly 2 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+
+- label: V1 Test e2e (4 GPUs) # 65min matches the e2e + engine step; TODO confirm for this subset
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental, amdproduction]
+  # The test uses 4 GPUs and now runs on the mi325_4 pool, not mi325_8.
+  # Context on the earlier 8-GPU choice: https://github.com/vllm-project/vllm/pull/31040
+  agent_pool: mi325_4
+  optional: true
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # Only run tests that need 4 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index 19cd91370..b5b3eeb6d 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -14,7 +14,7 @@ steps:
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
 
-- label: V1 e2e + engine
+- label: V1 e2e + engine (1 GPU)
   timeout_in_minutes: 45
   source_file_dependencies:
     - vllm/
@@ -36,3 +36,35 @@ steps:
       commands:
       - pytest -v -s v1/e2e
       - pytest -v -s v1/engine
+
+- label: V1 e2e (2 GPUs)
+  timeout_in_minutes: 60 # TODO: Revisit this timeout once the test has proven stable
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
+  commands:
+    # Only run tests that need exactly 2 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+  mirror:
+    amd:
+      device: mi325_2
+      depends_on:
+      - image-build-amd
+
+- label: V1 e2e (4 GPUs)
+  timeout_in_minutes: 60 # TODO: Revisit this timeout once the test has proven stable
+  optional: true
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
+  commands:
+    # Only run tests that need 4 GPUs
+    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+  mirror:
+    amd:
+      device: mi325_4
+      depends_on:
+      - image-build-amd
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 7f2db19a0..4c90df5f4 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -630,7 +630,7 @@ def test_eagle_correctness_medium(
             False,
             "auto",
             0.8,
-            marks=multi_gpu_marks(num_gpus=4),
+            marks=[*multi_gpu_marks(num_gpus=4), large_gpu_mark(min_gb=40)],
             id="llama4_eagle",
         ),
         pytest.param(