---
# Buildkite pipeline group: kernel test jobs.
# Runs after the image build; each step lists the source paths that trigger it
# (source_file_dependencies) and the pytest commands it executes.
# $$BUILDKITE_PARALLEL_JOB is escaped ($$) so Buildkite, not the uploader,
# expands it at job runtime.
group: Kernels
depends_on:
  - image-build
steps:
  - label: vLLM IR Tests
    timeout_in_minutes: 10
    device: h200_18gb
    working_dir: "/vllm-workspace/"
    source_file_dependencies:
      - vllm/ir
      - vllm/kernels
    commands:
      - pytest -v -s tests/ir
      - pytest -v -s tests/kernels/ir

  - label: Kernels Core Operation Test
    timeout_in_minutes: 75
    source_file_dependencies:
      - csrc/
      - tests/kernels/core
      - tests/kernels/test_concat_mla_q.py
    commands:
      - pytest -v -s kernels/core kernels/test_concat_mla_q.py

  - label: Kernels Attention Test %N
    timeout_in_minutes: 35
    source_file_dependencies:
      - csrc/attention/
      - vllm/v1/attention
      # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
      - vllm/model_executor/layers/attention
      - vllm/utils/flashinfer.py
      - tests/kernels/attention
    commands:
      - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels Quantization Test %N
    timeout_in_minutes: 90
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization
    commands:
      - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 2

  - label: Kernels MoE Test %N
    timeout_in_minutes: 25
    source_file_dependencies:
      - csrc/quantization/cutlass_w8a8/moe/
      - csrc/moe/
      - tests/kernels/moe
      - vllm/model_executor/layers/fused_moe/
      - vllm/distributed/device_communicators/
      - vllm/envs.py
      - vllm/config
    commands:
      - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
      - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    parallelism: 5

  - label: Kernels Mamba Test
    timeout_in_minutes: 45
    source_file_dependencies:
      - csrc/mamba/
      - tests/kernels/mamba
      - vllm/model_executor/layers/mamba/ops
    commands:
      - pytest -v -s kernels/mamba

  - label: Kernels DeepGEMM Test (H100)
    timeout_in_minutes: 45
    device: h100
    num_devices: 1
    source_file_dependencies:
      - tools/install_deepgemm.sh
      - vllm/utils/deep_gemm.py
      - vllm/model_executor/layers/fused_moe
      - vllm/model_executor/layers/quantization
      - tests/kernels/quantization/test_block_fp8.py
      - tests/kernels/moe/test_deepgemm.py
      - tests/kernels/moe/test_batched_deepgemm.py
      - tests/kernels/attention/test_deepgemm_attention.py
    commands:
      - pytest -v -s kernels/quantization/test_block_fp8.py
      - pytest -v -s kernels/moe/test_deepgemm.py
      - pytest -v -s kernels/moe/test_batched_deepgemm.py
      - pytest -v -s kernels/attention/test_deepgemm_attention.py

  - label: Kernels (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: b200
    # optional: true
    source_file_dependencies:
      - csrc/quantization/fp4/
      - csrc/attention/mla/
      - csrc/quantization/cutlass_w8a8/moe/
      - vllm/model_executor/layers/fused_moe/cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
      - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
      - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/v1/attention/backends/mla/cutlass_mla.py
      - vllm/v1/attention/backends/mla/flashinfer_mla.py
      - vllm/v1/attention/selector.py
      - vllm/platforms/cuda.py
      - tests/kernels/test_top_k_per_row.py
    commands:
      - nvidia-smi
      - python3 examples/basic/offline_inference/chat.py
      # Attention
      # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
      - pytest -v -s tests/kernels/attention/test_attention_selector.py
      - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
      - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
      - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
      - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
      - pytest -v -s tests/kernels/test_top_k_per_row.py
      # Quantization
      - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
      - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
      - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
      - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
      - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
      - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
      - pytest -v -s tests/kernels/moe/test_flashinfer.py
      - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
      - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
      # e2e
      - pytest -v -s tests/models/quantization/test_nvfp4.py

  - label: Kernels Helion Test
    timeout_in_minutes: 30
    device: h100
    source_file_dependencies:
      - vllm/utils/import_utils.py
      - tests/kernels/helion/
    commands:
      - pip install helion==0.3.3
      - pytest -v -s kernels/helion/

  - label: Kernels FP8 MoE Test (1 H100)
    timeout_in_minutes: 90
    device: h100
    num_devices: 1
    optional: true
    commands:
      - pytest -v -s kernels/moe/test_cutlass_moe.py
      - pytest -v -s kernels/moe/test_flashinfer.py
      - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
      - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
      - pytest -v -s kernels/moe/test_moe.py
      # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
      - pytest -v -s kernels/moe/test_block_int8.py
      - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
      - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py

  - label: Kernels FP8 MoE Test (2 H100s)
    timeout_in_minutes: 90
    device: h100
    num_devices: 2
    optional: true
    commands:
      - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
      - pytest -v -s kernels/moe/test_deepep_moe.py

  - label: Kernels Fp4 MoE Test (B200)
    timeout_in_minutes: 60
    device: b200
    num_devices: 1
    optional: true
    commands:
      - pytest -v -s kernels/moe/test_cutedsl_moe.py
      - pytest -v -s kernels/moe/test_flashinfer_moe.py
      - pytest -v -s kernels/moe/test_nvfp4_moe.py
      - pytest -v -s kernels/moe/test_ocp_mx_moe.py

  - label: Kernels FusedMoE Layer Test (2 H100s)
    timeout_in_minutes: 90
    device: h100
    num_devices: 2
    optional: true
    commands:
      - pytest -v -s kernels/moe/test_moe_layer.py

  - label: Kernels FusedMoE Layer Test (2 B200s)
    timeout_in_minutes: 90
    device: b200
    num_devices: 2
    optional: true
    commands:
      - pytest -v -s kernels/moe/test_moe_layer.py