[CI] Breakup h200 tests (#30499)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-01-15 23:23:22 -07:00
parent 4ae77dfd42
commit 14ce524249
5 changed files with 390 additions and 296 deletions
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -3,16 +3,26 @@

 from __future__ import annotations

-import itertools
 import logging
-from collections.abc import Iterable
-from typing import Any, NamedTuple
+from typing import Any

 import pytest
 import regex as re

+from tests.compile.fusion_test_utils import (
+    CUSTOM_OPS_FP8,
+    CUSTOM_OPS_QUANT_RMS_NORM,
+    CUSTOM_OPS_RMS_NORM,
+    MODELS,
+    MODELS_FP4,
+    MODELS_FP8,
+    MODELS_GROUP_FP8,
+    Matches,
+    custom_ops_product,
+    is_blackwell,
+    run_model,
+)
 from tests.v1.attention.utils import AttentionBackendEnum
-from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
@@ -20,228 +30,6 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer

 from ...utils import flat_product, multi_gpu_test

-is_blackwell = lambda: current_platform.is_device_capability_family(100)
-"""Are we running on Blackwell, a lot of tests depend on it"""
-
-
-class Matches(NamedTuple):
-    attention_fusion: int = 0
-    allreduce_fusion: int = 0
-    rms_quant_norm_fusion: int = 0
-    sequence_parallel: int = 0
-    async_tp: int = 0
-
-
-class ModelBackendTestCase(NamedTuple):
-    model_name: str
-    model_kwargs: dict[str, Any]
-    backend: AttentionBackendEnum
-    matches: Matches
-
-
-MODELS_FP8: list[ModelBackendTestCase] = []
-MODELS_FP4: list[ModelBackendTestCase] = []
-MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
-MODELS: list[ModelBackendTestCase] = []  # tp-only
-
-if current_platform.is_cuda():
-    MODELS_FP8 = [
-        ModelBackendTestCase(
-            # Use smaller model for L40s in CI
-            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=32,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-        ModelBackendTestCase(
-            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
-            # https://github.com/vllm-project/vllm/issues/28568
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=48,
-                allreduce_fusion=96,
-                sequence_parallel=96,
-                async_tp=95,  # mlp is moe, no fusion there
-            ),
-        ),
-    ]
-
-    MODELS_FP4 = [
-        ModelBackendTestCase(
-            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER,
-            matches=Matches(
-                attention_fusion=32,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-    ]
-
-    # TP only
-    MODELS = [
-        ModelBackendTestCase(
-            model_name="meta-llama/Llama-3.1-8B-Instruct",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=0,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-        ModelBackendTestCase(
-            model_name="Qwen/Qwen3-30B-A3B",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=0,
-                allreduce_fusion=97,
-                sequence_parallel=97,
-                async_tp=96,  # MLP is MoE, half the fusions of dense
-            ),
-        ),
-    ]
-
-elif current_platform.is_rocm():
-    MODELS_FP8 = [
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.ROCM_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-    ]
-
-CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
-
-
-def has_cuda_graph_wrapper_metadata() -> bool:
-    from importlib import import_module
-
-    try:
-        module = import_module("torch._inductor.utils")
-        module.CUDAGraphWrapperMetadata  # noqa B018
-    except AttributeError:
-        return False
-    return True
-
-
-@pytest.mark.parametrize(
-    "model_name, model_kwargs, backend, matches, custom_ops",
-    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
-    list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
-    # quant_fp4 only has the custom impl
-    + list(flat_product(MODELS_FP4, [""])),
-)
-@pytest.mark.parametrize(
-    "inductor_graph_partition",
-    [
-        pytest.param(
-            True,
-            marks=pytest.mark.skipif(
-                not has_cuda_graph_wrapper_metadata(),
-                reason="This test requires"
-                "torch._inductor.utils.CUDAGraphWrapperMetadata to run",
-            ),
-        ),
-        False,
-    ],
-)
-def test_attn_quant(
-    model_name: str,
-    model_kwargs: dict[str, Any],
-    backend: AttentionBackendEnum,
-    matches: Matches,
-    custom_ops: str,
-    inductor_graph_partition: bool,
-    caplog_mp_spawn,
-    monkeypatch,
-):
-    if backend == AttentionBackendEnum.FLASHINFER and (
-        not is_blackwell() or not has_flashinfer()
-    ):
-        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
-    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
-    custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-    if inductor_graph_partition:
-        mode = CUDAGraphMode.FULL_AND_PIECEWISE
-        splitting_ops: list[str] | None = None
-    else:
-        # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at
-        # CUDAGraphMode.NONE here because it derives an attention backend that
-        # does not support full cudagraphs
-        mode = CUDAGraphMode.FULL_DECODE_ONLY
-        splitting_ops = []
-
-    # Disable, compile cache to make sure custom passes run.
-    # Otherwise, we can't verify fusion happened through the logs.
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    # To capture subprocess logs, we need to know whether spawn or fork is used.
-    # Force spawn as it is more general.
-    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-
-    model_kwargs["attention_config"] = {"backend": backend.name}
-
-    compilation_config = CompilationConfig(
-        # Testing properties
-        custom_ops=custom_ops_list,
-        use_inductor_graph_partition=inductor_graph_partition,
-        cudagraph_mode=mode,
-        splitting_ops=splitting_ops,
-        # Common
-        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
-        # Inductor caches custom passes by default as well via uuid
-        inductor_compile_config={"force_disable_caches": True},
-    )
-
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
-        run_model(compilation_config, model_name, **model_kwargs)
-
-    log_matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    assert len(log_matches) == 1, log_holder.text
-    assert int(log_matches[0]) == matches.attention_fusion
-
-
-CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
-
-
-def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
-    for op_list in itertools.product(*custom_ops_lists):
-        yield ",".join(op_list)
-

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
@@ -421,7 +209,7 @@ def test_tp2_attn_quant_async_tp(
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
-        level=CompilationMode.VLLM_COMPILE,
+        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_attn_quant=True,
            eliminate_noops=True,
@@ -464,62 +252,6 @@ def test_tp2_attn_quant_async_tp(
    assert int(log_matches[1]) == matches.async_tp


-def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
-    compilation_config = (
-        compile_config
-        if isinstance(compile_config, CompilationConfig)
-        else CompilationConfig(mode=compile_config)
-    )
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    # Allow override from model_kwargs
-    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
-    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
-
-    # No cudagraphs by default
-    if compilation_config.cudagraph_mode is None:
-        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-    llm = LLM(
-        model=model,
-        compilation_config=compilation_config,
-        **model_kwargs,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-    # Get the compile ranges split points after vllm config post init
-    # in order to compute compile ranges correctly
-    compilation_config.compile_ranges_split_points = (
-        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
-    )
-
-
-if current_platform.is_cuda():
-    MODELS_GROUP_FP8 = [
-        ModelBackendTestCase(
-            model_name="Qwen/Qwen3-30B-A3B-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                rms_quant_norm_fusion=48,
-            ),
-        ),
-    ]
-
-CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
-
-
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion