[CI][torch.compile] Reduce e2e fusion test time (#33293)
Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: ProExpertProg <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
0
tests/compile/fusions_e2e/__init__.py
Normal file
0
tests/compile/fusions_e2e/__init__.py
Normal file
102
tests/compile/fusions_e2e/common.py
Normal file
102
tests/compile/fusions_e2e/common.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import itertools
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
|
||||
class Matches(NamedTuple):
    """Per-pass match counts used by the fusion tests.

    Each field holds the count for one fusion pass; every count defaults to 0.
    """

    # Simple pointwise fusions.
    rms_quant_fusion: int = 0
    act_quant_fusion: int = 0
    norm_rope_fusion: int = 0
    attn_quant_fusion: int = 0

    # Distributed fusions.
    ar_rms_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
|
||||
|
||||
|
||||
class ModelFusionInfo(NamedTuple):
    """A model under test together with its expected fusion match counts.

    The field order matters: test parametrization unpacks instances
    positionally as (model_name, matches, model_kwargs, hf_overrides).
    """

    model_name: str

    # Given number of hidden layers, produces the matches object.
    matches: Callable[[int], Matches]

    # NOTE: this default is one dict object shared by every instance that does
    # not pass its own model_kwargs; consumers should copy it before mutating.
    model_kwargs: dict[str, Any] = {}

    # Given number of hidden layers, produces the hf_overrides dict.
    hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n}
|
||||
|
||||
|
||||
class AttentionBackendCase(NamedTuple):
    """An attention backend plus the model kwargs that go with it."""

    backend: AttentionBackendEnum

    # Additional args required for attn+quant fusion.
    model_kwargs: dict[str, Any] = {}
|
||||
|
||||
|
||||
def is_blackwell():
    """Are we running on Blackwell, a lot of tests depend on it.

    Returns the result of asking the current platform whether the device
    belongs to capability family 100.
    """
    return current_platform.is_device_capability_family(100)
|
||||
|
||||
|
||||
def custom_ops_combos(*custom_ops: str) -> Iterable[str]:
    """Generate all combinations of custom ops for parametrization.

    Every op contributes two spellings, ``-op`` and ``+op``. Each yielded
    string is one comma-joined choice per op, in the order the ops were passed,
    with the ``-`` spelling enumerated before the ``+`` spelling.
    """
    toggles = [(f"-{name}", f"+{name}") for name in custom_ops]
    yield from map(",".join, itertools.product(*toggles))


# Quick inline validation
assert [*custom_ops_combos("silu_and_mul")] == ["-silu_and_mul", "+silu_and_mul"]
assert [*custom_ops_combos("quant_fp8", "rms_norm")] == [
    "-quant_fp8,-rms_norm",
    "-quant_fp8,+rms_norm",
    "+quant_fp8,-rms_norm",
    "+quant_fp8,+rms_norm",
]
|
||||
|
||||
|
||||
def has_cuda_graph_wrapper_metadata() -> bool:
    """Report whether ``torch._inductor.utils`` has ``CUDAGraphWrapperMetadata``.

    The module is imported lazily inside the function. Only the absence of the
    attribute is treated as "not available"; an import failure propagates.
    """
    from importlib import import_module

    inductor_utils = import_module("torch._inductor.utils")
    return hasattr(inductor_utils, "CUDAGraphWrapperMetadata")
|
||||
|
||||
|
||||
# Parametrization values for the graph-partition flag. The True case is skipped
# when has_cuda_graph_wrapper_metadata() reports False.
INDUCTOR_GRAPH_PARTITION = [
    pytest.param(
        True,
        id="inductor_partition",
        marks=pytest.mark.skipif(
            not has_cuda_graph_wrapper_metadata(),
            reason="torch version does not support Inductor partition",
        ),
    ),
    pytest.param(False, id="dynamo_partition"),
]
|
||||
|
||||
# Log-line patterns, one per fusion kind. Each pattern has a single capture
# group holding the count reported in that log line. The "." of every file name
# is escaped so it only matches a literal dot.
FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
    # Anchored on "[" so that other files ending in "fusion.py" do not match.
    "rms_quant_fusion": re.compile(
        r"\[(?:compilation/)?fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "act_quant_fusion": re.compile(
        r"activation_quant_fusion\.py:\d+] Replaced (\d+) patterns"
    ),
    "norm_rope_fusion": re.compile(
        r"qk_norm_rope_fusion\.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
    ),
    "attn_quant_fusion": re.compile(
        r"fusion_attn\.py:\d+] Fused quant onto (\d+) attention nodes"
    ),
    # NOTE: ar_rms_fusion and async_tp use the same pattern.
    "ar_rms_fusion": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
    "sequence_parallel": re.compile(
        r"sequence_parallelism\.py:\d+] Replaced (\d+) patterns"
    ),
    "async_tp": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
}
|
||||
158
tests/compile/fusions_e2e/conftest.py
Normal file
158
tests/compile/fusions_e2e/conftest.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||
|
||||
from .common import FUSION_LOG_PATTERNS, AttentionBackendCase, Matches
|
||||
|
||||
|
||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
    """Run a model with the given compilation config for E2E fusion tests.

    Args:
        compile_config: Either a ready CompilationConfig, or a value passed as
            ``mode`` to a newly created CompilationConfig.
        model: Model name passed to ``LLM``.
        **model_kwargs: Extra keyword arguments for ``LLM``; they override the
            defaults set here (``tensor_parallel_size=1`` and
            ``disable_custom_all_reduce=True``).

    Side effects: prints every prompt with its generated text, and copies the
    engine's ``compile_ranges_split_points`` back onto the compilation config.
    """
    if isinstance(compile_config, CompilationConfig):
        compilation_config = compile_config
    else:
        compilation_config = CompilationConfig(mode=compile_config)

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)

    # Defaults come first so that explicitly passed model_kwargs override them.
    llm_kwargs = {
        "disable_custom_all_reduce": True,
        "tensor_parallel_size": 1,
        **model_kwargs,
    }

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    llm = LLM(model=model, compilation_config=compilation_config, **llm_kwargs)

    # Print the outputs.
    for request_output in llm.generate(prompts, sampling_params):
        prompt, generated_text = request_output.prompt, request_output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Get the compile ranges split points after vllm config post init
    # in order to compute compile ranges correctly
    engine_compilation_config = llm.llm_engine.vllm_config.compilation_config
    compilation_config.compile_ranges_split_points = (
        engine_compilation_config.compile_ranges_split_points
    )
|
||||
|
||||
|
||||
@pytest.fixture
def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
    """Fixture providing a callable that runs a model and checks fusion logs.

    The returned ``run`` callable sets the environment variables needed for the
    run, builds a CompilationConfig, runs the model while capturing logs via
    ``caplog_mp_spawn``, and asserts that the counts parsed from the logs with
    FUSION_LOG_PATTERNS agree with the expected ``matches``.
    """

    def run(
        model_name: str,
        matches: Matches,
        model_kwargs: dict,
        attn_backend: AttentionBackendCase,
        compilation_config: dict,
        matches_check: list[str],
        use_deepgemm: bool = False,
        tp_size: int = 1,
    ):
        """Run ``model_name`` and verify the fusions named in ``matches_check``.

        Args:
            model_name: Model to run.
            matches: Expected count per fusion kind.
            model_kwargs: Model kwargs; they override the backend's own kwargs.
            attn_backend: Attention backend case to run with.
            compilation_config: Keyword arguments for CompilationConfig; must
                contain "use_inductor_graph_partition". Not mutated.
            matches_check: Names of the Matches fields to verify.
            use_deepgemm: Value written to VLLM_USE_DEEP_GEMM ("1" or "0").
            tp_size: Tensor parallel size.

        Raises:
            AssertionError: If the logged counts differ from the expectation.
        """
        monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0")

        # Disable the compile cache to make sure custom passes run.
        # Otherwise, we can't verify fusion happened through the logs.
        monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

        # To capture subprocess logs, we need to know whether spawn or fork is used.
        # Force spawn as it is more general.
        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

        model_kwargs = {**attn_backend.model_kwargs, **model_kwargs}
        model_kwargs["attention_config"] = {"backend": attn_backend.backend.name}
        model_kwargs["tensor_parallel_size"] = tp_size

        # Work on a copy so the caller's dict is left untouched.
        compilation_config = dict(compilation_config)

        # Always compile the full graph instead of piecewise
        if not compilation_config["use_inductor_graph_partition"]:
            compilation_config["splitting_ops"] = []

        full_compilation_config = CompilationConfig(
            cudagraph_mode=CUDAGraphMode.NONE,
            mode=CompilationMode.VLLM_COMPILE,
            inductor_compile_config={"force_disable_caches": True},
            **compilation_config,
        )

        with caplog_mp_spawn(logging.DEBUG) as log_holder:
            run_model(full_compilation_config, model_name, **model_kwargs)

        num_compile_ranges = len(full_compilation_config.get_compile_ranges())
        assert num_compile_ranges in [1, 2]

        print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}")
        print("Fusion results:")

        # Iterate through all so printing happens before asserting
        log_matches_dict = {}
        for match_name, pattern in FUSION_LOG_PATTERNS.items():
            log_matches_dict[match_name] = pattern.findall(log_holder.text)
            print(f"- {match_name}={','.join(log_matches_dict[match_name])}")

        # Now check the matches
        for match_name in matches_check:
            # For ar_rms_fusion exactly one compile range is counted as
            # activated; every other fusion counts all compile ranges.
            num_ranges_activated = (
                1 if match_name == "ar_rms_fusion" else num_compile_ranges
            )
            n_expected = tp_size * num_ranges_activated

            log_matches = [int(ms) for ms in log_matches_dict[match_name]]
            assert len(log_matches) == n_expected, (
                f"Could not find {n_expected} {match_name} "
                f"(found {len(log_matches)}) in:\n {log_holder.text}"
            )

            expected_matches = getattr(matches, match_name)

            if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check:
                # AR+rms+quant takes precedence over rms+quant if activated.
                # That means we get full matching where ar+rms+quant was not
                # activated, and less where it was
                assert sum(m == expected_matches for m in log_matches) == tp_size * (
                    num_ranges_activated - 1
                ), "Expecting full rms+quant fusion where ar+rms+quant not activated"

                assert all(
                    expected_matches - matches.ar_rms_fusion <= m <= expected_matches
                    for m in log_matches
                ), (
                    f"Expecting at least {expected_matches - matches.ar_rms_fusion} "
                    "where ar+rms+quant was activated"
                )
            else:
                expected_matches_list = [expected_matches] * n_expected
                assert sorted(log_matches) == expected_matches_list, (
                    f"{match_name} expected: {expected_matches_list}, "
                    f"found: {sorted(log_matches)}"
                )

            if match_name == "ar_rms_fusion":
                # The remaining compile ranges must each log a "Skipping" line.
                skipped = re.findall(
                    r"pass_manager.py:\d+] Skipping "
                    r".*AllReduceFusionPass.* with compile range",
                    log_holder.text,
                )

                n_expected = tp_size * (num_compile_ranges - num_ranges_activated)
                assert len(skipped) == n_expected, (
                    f'Could not find {n_expected} "Skipping AllReduceFusionPass" '
                    f"(found {len(skipped)}) in:\n {log_holder.text}"
                )

    return run
|
||||
112
tests/compile/fusions_e2e/models.py
Normal file
112
tests/compile/fusions_e2e/models.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
from .common import AttentionBackendCase, Matches, ModelFusionInfo, is_blackwell
|
||||
|
||||
# Attn backends
|
||||
# FlashInfer case: passes kv_cache_dtype="fp8" as a model kwarg, and is skipped
# unless both is_blackwell() and has_flashinfer() are true.
FLASHINFER_ATTN = pytest.param(
    AttentionBackendCase(
        backend=AttentionBackendEnum.FLASHINFER,
        model_kwargs={"kv_cache_dtype": "fp8"},
    ),
    id="FLASHINFER",
    marks=pytest.mark.skipif(
        not (is_blackwell() and has_flashinfer()),
        reason="FI backend requires Blackwell and FlashInfer",
    ),
)

# Triton case: no extra model kwargs and no skip condition.
TRITON_ATTN = pytest.param(
    AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN),
    id="TRITON_ATTN",
)
|
||||
|
||||
# Models
|
||||
# Each entry maps the number of hidden layers to the match counts for that
# model. Fields not listed keep their default of 0.
llama3_8b = ModelFusionInfo(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    matches=lambda n_layers: Matches(
        ar_rms_fusion=2 * n_layers + 1,
        sequence_parallel=2 * n_layers + 1,
        async_tp=4 * n_layers,
    ),
)

llama3_8b_fp8 = ModelFusionInfo(
    model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
    matches=lambda n_layers: Matches(
        rms_quant_fusion=2 * n_layers,
        act_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=2 * n_layers + 1,
        sequence_parallel=2 * n_layers + 1,
        async_tp=4 * n_layers,
    ),
)

llama3_8b_fp4 = ModelFusionInfo(
    model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
    matches=lambda n_layers: Matches(
        rms_quant_fusion=0,
        act_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=2 * n_layers + 1,
        sequence_parallel=2 * n_layers + 1,
        async_tp=4 * n_layers,
    ),
)

# MoEs cannot do act+quant fusion because those ops are hidden from torch.compile.
# MoEs also only expose 1 rms+quant fusion because the quant for up_proj is hidden.
# TODO(luka): https://github.com/vllm-project/vllm/issues/31985
# Also, for MoEs, gemm+collective fusion only happens for dense GEMMs (o_proj/qkv proj)

# The Llama 4 Scout entries set the layer count under "text_config" instead of
# at the top level of hf_overrides.
llama4_scout_fp8 = ModelFusionInfo(
    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
    matches=lambda n_layers: Matches(
        rms_quant_fusion=n_layers,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=2 * n_layers,
        sequence_parallel=2 * n_layers,
        async_tp=2 * n_layers - 1,
    ),
)

llama4_scout_fp4 = ModelFusionInfo(
    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
    matches=lambda n_layers: Matches(
        rms_quant_fusion=0,
        attn_quant_fusion=n_layers,
        ar_rms_fusion=2 * n_layers,
        sequence_parallel=2 * n_layers,
        async_tp=2 * n_layers - 1,
    ),
)

qwen3_a3b = ModelFusionInfo(
    model_name="Qwen/Qwen3-30B-A3B",
    matches=lambda n_layers: Matches(
        norm_rope_fusion=n_layers,
        ar_rms_fusion=2 * n_layers + 1,
        sequence_parallel=2 * n_layers + 1,
        async_tp=2 * n_layers,
    ),
)

qwen3_a3b_fp8 = ModelFusionInfo(
    model_name="Qwen/Qwen3-30B-A3B-FP8",
    matches=lambda n_layers: Matches(
        rms_quant_fusion=n_layers,
        # TODO broken on Blackwell:
        # https://github.com/vllm-project/vllm/issues/33295
        norm_rope_fusion=0 if is_blackwell() else n_layers,
        attn_quant_fusion=0,  # attn + group quant not supported
        ar_rms_fusion=2 * n_layers + 1,
        sequence_parallel=2 * n_layers + 1,
        async_tp=2 * n_layers,
    ),
)
|
||||
146
tests/compile/fusions_e2e/test_tp1_quant.py
Normal file
146
tests/compile/fusions_e2e/test_tp1_quant.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import PassConfig
|
||||
|
||||
from .common import (
|
||||
INDUCTOR_GRAPH_PARTITION,
|
||||
AttentionBackendCase,
|
||||
Matches,
|
||||
custom_ops_combos,
|
||||
is_blackwell,
|
||||
)
|
||||
from .models import (
|
||||
FLASHINFER_ATTN,
|
||||
TRITON_ATTN,
|
||||
llama3_8b_fp4,
|
||||
llama3_8b_fp8,
|
||||
llama4_scout_fp4,
|
||||
llama4_scout_fp8,
|
||||
qwen3_a3b_fp8,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
    [
        (*llama3_8b_fp8, False),
        (*llama4_scout_fp8, False),
        (*qwen3_a3b_fp8, False),
        (*qwen3_a3b_fp8, True),
    ],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [6])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp1_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    use_deepgemm: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check norm/act/attn quant and QK norm+RoPE fusions for FP8 models."""
    if use_deepgemm:
        # TODO(luka/eliza) DeepGEMM uses different quants, matching not supported
        # - on Blackwell, uses a special quant fp8, currently not supported
        # - on Hopper, tma-aligned scales inhibit matching (fix WIP)
        pytest.skip("DeepGEMM & quant matching not currently supported")

    matches = matches_fn(n_layers)

    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
        # This is why config forces +quant_fp8 by default
        pytest.skip("native QuantFP8 matching not supported for group quant")

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        use_deepgemm=use_deepgemm,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp4, llama4_scout_fp4],
)
@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [6])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
def test_tp1_fp4_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check act/attn quant and QK norm+RoPE fusions for FP4 models."""
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
        ),
    )

    matches_check = ["act_quant_fusion", "attn_quant_fusion", "norm_rope_fusion"]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
    )
|
||||
199
tests/compile/fusions_e2e/test_tp2_ar_rms.py
Normal file
199
tests/compile/fusions_e2e/test_tp2_ar_rms.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import PassConfig
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .common import (
|
||||
INDUCTOR_GRAPH_PARTITION,
|
||||
AttentionBackendCase,
|
||||
Matches,
|
||||
custom_ops_combos,
|
||||
is_blackwell,
|
||||
)
|
||||
from .models import (
|
||||
FLASHINFER_ATTN,
|
||||
TRITON_ATTN,
|
||||
llama3_8b,
|
||||
llama3_8b_fp4,
|
||||
llama3_8b_fp8,
|
||||
llama4_scout_fp4,
|
||||
llama4_scout_fp8,
|
||||
qwen3_a3b,
|
||||
qwen3_a3b_fp8,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    # qwen3-fp8 should still fuse AR+rms even though group quant is not yet supported
    [llama3_8b_fp8, llama4_scout_fp8, qwen3_a3b_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_ar_rms_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check allreduce+rms together with the quant fusions for FP8 models at TP=2."""
    matches = matches_fn(n_layers)

    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
        # This is why config forces +quant_fp8 by default
        pytest.skip("native QuantFP8 matching not supported for group quant")

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp4, llama4_scout_fp4],
)
@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
def test_tp2_ar_rms_fp4_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check allreduce+rms together with act/attn quant fusions for FP4 at TP=2."""
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_act_quant=True,
            fuse_attn_quant=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "act_quant_fusion",
        "attn_quant_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_ar_rms_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check allreduce+rms and QK norm+RoPE fusions for unquantized models at TP=2."""
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            enable_qk_norm_rope_fusion=True,
            fuse_allreduce_rms=True,
        ),
    )

    matches_check = [
        "norm_rope_fusion",
        "ar_rms_fusion",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||
143
tests/compile/fusions_e2e/test_tp2_async_tp.py
Normal file
143
tests/compile/fusions_e2e/test_tp2_async_tp.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import PassConfig
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .common import (
|
||||
INDUCTOR_GRAPH_PARTITION,
|
||||
AttentionBackendCase,
|
||||
Matches,
|
||||
custom_ops_combos,
|
||||
is_blackwell,
|
||||
)
|
||||
from .models import (
|
||||
FLASHINFER_ATTN,
|
||||
TRITON_ATTN,
|
||||
llama3_8b,
|
||||
llama3_8b_fp8,
|
||||
llama4_scout_fp8,
|
||||
qwen3_a3b,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp8, llama4_scout_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Check sequence parallel, async TP and quant fusions for FP8 models at TP=2."""
    matches = matches_fn(n_layers)

    if is_blackwell():
        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            fuse_norm_quant=True,
            fuse_act_quant=True,
            fuse_attn_quant=True,
            enable_qk_norm_rope_fusion=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
    )

    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Check sequence parallel, async TP and QK norm+RoPE fusions at TP=2."""
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict: the parametrized model_kwargs is the default dict
    # shared by the ModelFusionInfo instances, so it must not be mutated.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    compilation_config = dict(
        use_inductor_graph_partition=inductor_graph_partition,
        custom_ops=custom_ops.split(","),
        pass_config=PassConfig(
            enable_qk_norm_rope_fusion=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
    )

    matches_check = [
        "norm_rope_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
||||
Reference in New Issue
Block a user