[ci] Sync test areas with test-pipeline.yaml and enable new pipeline generator (#33080)

Signed-off-by: Kevin H. Luu <khluu000@gmail.com> Signed-off-by: khluu <khluu000@gmail.com> Co-authored-by: Kevin Luu <khluu@Kevins-MacBook-Pro.local>
2026-01-26 12:28:20 -08:00
parent 43a013c3a2
commit ebe0ba91db
24 changed files with 528 additions and 102 deletions
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,7 +4,7 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention
@@ -15,7 +15,7 @@ steps:

 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  gpu: b200
+  device: b200
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -26,7 +26,7 @@ steps:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    # this runner has 2 GPUs available even though num_devices=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
@@ -37,9 +37,9 @@ steps:
 - label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -18,7 +18,7 @@ steps:
 - label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -54,7 +54,7 @@ steps:
 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
@@ -103,8 +103,8 @@ steps:

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  gpu: h100
-  num_gpus: 8
+  device: h100
+  num_devices: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +120,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: Distributed Tests (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -133,26 +133,34 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H200)
-  gpu: h200
+- label: Sequence Parallel Tests (H100)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run sequence parallel tests
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+
+- label: Distributed Tests (2 GPUs)(H100)
+  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

 - label: Distributed Tests (2 GPUs)(B200)
-  gpu: b200
+  device: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -161,8 +169,9 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  num_nodes: 2
+  no_plugin: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -176,7 +185,7 @@ steps:
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
@@ -184,10 +193,21 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

+- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
 - label: Pipeline + Context Parallelism (4 GPUs))
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -196,4 +216,46 @@ steps:
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: Hopper Fusion E2E Tests (H100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # skip Llama-4 since it does not fit on this device
+    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
+
+- label: Hopper Fusion Distributed E2E Tests (2xH100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,27 +4,27 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,10 +33,11 @@ steps:
  timeout_in_minutes: 30
  optional: true
  soft_fail: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -23,4 +23,8 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle the (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    # Run the rest of v1/engine tests
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,7 +14,7 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -57,8 +57,8 @@ steps:

 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  gpu: h100
-  num_gpus: 1
+  device: h100
+  num_devices: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -77,7 +77,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -114,4 +114,55 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py
+
+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  device: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
+
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -12,9 +12,9 @@ steps:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: LM Eval Large Models (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -24,9 +24,9 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,10 +37,39 @@ steps:

 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
+
+- label: LM Eval Large Models (H200)
+  timeout_in_minutes: 60
+  device: h200
+  optional: true
+  num_devices: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY)
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+
+- label: MoE Refactor Integration Test (B200 - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:

 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -31,7 +31,7 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/v1
-  no_gpu: true
+  device: cpu
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -82,7 +82,7 @@ steps:

 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -127,7 +127,7 @@ steps:
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  no_gpu: true
+  device: cpu
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
@@ -142,7 +142,7 @@ steps:
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -155,7 +155,7 @@ steps:

 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -44,7 +44,7 @@ steps:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  no_gpu: true
+  device: cpu
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -18,7 +18,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  no_gpu: true
+  device: cpu
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -16,14 +16,14 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  optional: true
  source_file_dependencies:
  - vllm/
@@ -15,8 +15,8 @@ steps:

 - label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  gpu: a100
+  num_devices: 2
+  device: a100
  optional: true
  source_file_dependencies:
  - vllm/