Enabling some B200-specific tests on MI355 (#35253)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Signed-off-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
This commit is contained in:
Alexei-V-Ivanov-AMD
2026-03-06 13:27:20 -06:00
committed by GitHub
parent f3c6c9c9d7
commit 225d1090a0
4 changed files with 89 additions and 110 deletions

View File

@@ -499,17 +499,6 @@ steps:
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
@@ -1185,47 +1174,40 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@@ -1258,16 +1240,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -1681,16 +1653,6 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
@@ -2176,19 +2138,6 @@ steps:
# TODO: Add the "V1 Test attention (MI300)" test group
- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi355_1
@@ -2205,6 +2154,8 @@ steps:
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
@@ -2829,7 +2780,9 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 21 min
- label: Blackwell Test (MI355) # 21 min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
@@ -2848,28 +2801,28 @@ steps:
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
commands:
- nvidia-smi
rocm-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/kernels/attention/test_attention_selector.py
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
## Quantization
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -2939,13 +2892,15 @@ steps:
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -3328,18 +3283,9 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_4
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200/MI355)
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
timeout_in_minutes: 60
gpu: b200
optional: true
@@ -3358,3 +3304,18 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
- label: Attention Benchmarks Smoke Test (B200/MI355)
device: b200
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_2
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1