- group: Compile
  depends_on:
    - image-build
  steps:
    - label: Sequence Parallel Correctness Tests (2 GPUs)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      num_devices: 2
      source_file_dependencies:
        - vllm/model_executor/layers/
        - vllm/compilation/
        - vllm/v1/worker/
        - vllm/v1/cudagraph_dispatcher.py
        - tests/compile/correctness_e2e/test_sequence_parallel.py
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

    - label: Sequence Parallel Correctness Tests (2xH100)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      device: h100
      optional: true
      num_devices: 2
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

    - label: AsyncTP Correctness Tests (2xH100)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      device: h100
      optional: true
      num_devices: 2
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

    - label: Distributed Compile Unit Tests (2xH100)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - vllm/compilation/
        - vllm/model_executor/layers
        - tests/compile/passes/distributed/
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -s -v tests/compile/passes/distributed

    - label: Fusion and Compile Unit Tests (B200)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: b200
      source_file_dependencies:
        - csrc/quantization/fp4/
        - vllm/model_executor/layers/quantization/
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/v1/attention/backends/flashinfer.py
        - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
        - tests/compile/passes/test_fusion_attn.py
        - tests/compile/passes/test_silu_mul_quant_fusion.py
        - tests/compile/passes/distributed/test_fusion_all_reduce.py
        - tests/compile/fullgraph/test_full_graph.py
      commands:
        # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
        - nvidia-smi
        - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
        - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
        # this runner has 2 GPUs available even though num_devices=2 is not set
        - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
        # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
        # TODO(luka) move to H100 once pass tests run on H100
        - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

    - label: Fusion E2E Quick (H100)
      timeout_in_minutes: 15
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 1
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models and attn backends but only Inductor partition and native custom ops
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
        # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

    - label: Fusion E2E Config Sweep (H100)
      timeout_in_minutes: 30
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 1
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

    - label: Fusion E2E Config Sweep (B200)
      timeout_in_minutes: 30
      working_dir: "/vllm-workspace/"
      device: b200
      num_devices: 1
      optional: true
      commands:
        - nvidia-smi
        # Run all models but only FLASHINFER, Inductor partition and native custom ops
        # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"

    - label: Fusion E2E TP2 Quick (H100)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models and attn backends but only Inductor partition and native custom ops
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

    - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
      timeout_in_minutes: 40
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8 & bf16) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

    - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
      timeout_in_minutes: 40
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8 & bf16) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

    - label: Fusion E2E TP2 (B200)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: b200
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models but only FLASHINFER, Inductor partition and native custom ops
        # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        # for ar-rms-quant-fp4, also sweep llama3
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"