2026-02-04 19:09:03 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
|
from collections.abc import Callable
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
from vllm.config import PassConfig
|
|
|
|
|
|
|
|
|
|
from ...utils import multi_gpu_test
|
|
|
|
|
from .common import (
|
|
|
|
|
INDUCTOR_GRAPH_PARTITION,
|
|
|
|
|
AttentionBackendCase,
|
|
|
|
|
Matches,
|
|
|
|
|
custom_ops_combos,
|
|
|
|
|
is_blackwell,
|
|
|
|
|
)
|
|
|
|
|
from .models import (
|
|
|
|
|
FLASHINFER_ATTN,
|
|
|
|
|
TRITON_ATTN,
|
|
|
|
|
llama3_8b,
|
|
|
|
|
llama3_8b_fp8,
|
|
|
|
|
llama4_scout_fp8,
|
|
|
|
|
qwen3_a3b,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp8, llama4_scout_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Run the e2e fusion check for the FP8 models with tp_size=2.

    The pass config enables the norm/act/attn quant fusions, qk-norm+rope
    fusion, sequence parallelism and GEMM+comms fusion, while leaving
    ``fuse_allreduce_rms`` off. The expected match counts come from
    ``matches_fn(n_layers)`` and are verified for the names listed in
    ``matches_check``.
    """
    matches = matches_fn(n_layers)

    if is_blackwell():
        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict instead of assigning into ``model_kwargs``: the
    # incoming object is the one stored in the parametrize entry and is handed
    # to every invocation, so in-place writes would leak between test cases.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    pass_config = PassConfig(
        fuse_norm_quant=True,
        fuse_act_quant=True,
        fuse_attn_quant=True,
        enable_qk_norm_rope_fusion=True,
        enable_sp=True,
        fuse_gemm_comms=True,
        fuse_allreduce_rms=False,
        # Override threshold for testing (models have small hidden_size)
        sp_min_token_num=512,
    )
    compilation_config = {
        "use_inductor_graph_partition": inductor_graph_partition,
        "custom_ops": custom_ops.split(","),
        "pass_config": pass_config,
    }

    # Names of the match counters to verify for this configuration.
    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_async_tp_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Run the e2e fusion check for the non-FP8 models with tp_size=2.

    The pass config enables qk-norm+rope fusion, sequence parallelism and
    GEMM+comms fusion, while leaving ``fuse_allreduce_rms`` off. The expected
    match counts come from ``matches_fn(n_layers)`` and are verified for the
    names listed in ``matches_check``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict instead of assigning into ``model_kwargs``: the
    # incoming object is the one stored in the parametrize entry and is handed
    # to every invocation, so in-place writes would leak between test cases.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    pass_config = PassConfig(
        enable_qk_norm_rope_fusion=True,
        enable_sp=True,
        fuse_gemm_comms=True,
        fuse_allreduce_rms=False,
        # Override threshold for testing (models have small hidden_size)
        sp_min_token_num=512,
    )
    compilation_config = {
        "use_inductor_graph_partition": inductor_graph_partition,
        "custom_ops": custom_ops.split(","),
        "pass_config": pass_config,
    }

    # Names of the match counters to verify for this configuration.
    matches_check = [
        "norm_rope_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
2026-02-25 21:00:12 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b_fp8, llama4_scout_fp8],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_sp_ar_rms_fp8_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
    monkeypatch,
):
    """Run the e2e fusion check for the FP8 models with allreduce+RMS fusion on.

    Uses tp_size=2 and a pass config that enables the norm/act/attn quant
    fusions, qk-norm+rope fusion, sequence parallelism, GEMM+comms fusion and
    ``fuse_allreduce_rms``. The expected match counts come from
    ``matches_fn(n_layers)`` and are verified for the names listed in
    ``matches_check``, which includes ``"ar_rms_fusion"``.
    """
    matches = matches_fn(n_layers)

    if is_blackwell():
        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict instead of assigning into ``model_kwargs``: the
    # incoming object is the one stored in the parametrize entry and is handed
    # to every invocation, so in-place writes would leak between test cases.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    pass_config = PassConfig(
        fuse_norm_quant=True,
        fuse_act_quant=True,
        fuse_attn_quant=True,
        enable_qk_norm_rope_fusion=True,
        enable_sp=True,
        fuse_gemm_comms=True,
        fuse_allreduce_rms=True,
        # Override threshold for testing (models have small hidden_size)
        sp_min_token_num=512,
    )
    compilation_config = {
        "use_inductor_graph_partition": inductor_graph_partition,
        "custom_ops": custom_ops.split(","),
        "pass_config": pass_config,
    }

    # Names of the match counters to verify for this configuration.
    matches_check = [
        "rms_quant_fusion",
        "act_quant_fusion",
        "norm_rope_fusion",
        "attn_quant_fusion",
        "ar_rms_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, matches_fn, model_kwargs, hf_overrides",
    [llama3_8b, qwen3_a3b],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
@pytest.mark.parametrize("n_layers", [4])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
def test_tp2_sp_ar_rms_fusions(
    model_name: str,
    matches_fn: Callable[[int], Matches],
    model_kwargs: dict,
    hf_overrides: Callable[[int], dict],
    attn_backend: AttentionBackendCase,
    n_layers: int,
    custom_ops: str,
    inductor_graph_partition: bool,
    run_e2e_fusion_test,
):
    """Run the e2e fusion check for the non-FP8 models with allreduce+RMS on.

    Uses tp_size=2 and a pass config that enables qk-norm+rope fusion,
    sequence parallelism, GEMM+comms fusion and ``fuse_allreduce_rms``. The
    expected match counts come from ``matches_fn(n_layers)`` and are verified
    for the names listed in ``matches_check``, which includes
    ``"ar_rms_fusion"``.
    """
    matches = matches_fn(n_layers)

    # Reduce size of model and skip weight loading time.
    # Build a fresh dict instead of assigning into ``model_kwargs``: the
    # incoming object is the one stored in the parametrize entry and is handed
    # to every invocation, so in-place writes would leak between test cases.
    model_kwargs = {
        **model_kwargs,
        "hf_overrides": hf_overrides(n_layers),
        "load_format": "dummy",
        "max_model_len": 1024,
    }

    pass_config = PassConfig(
        enable_qk_norm_rope_fusion=True,
        enable_sp=True,
        fuse_gemm_comms=True,
        fuse_allreduce_rms=True,
        # Override threshold for testing (models have small hidden_size)
        sp_min_token_num=512,
    )
    compilation_config = {
        "use_inductor_graph_partition": inductor_graph_partition,
        "custom_ops": custom_ops.split(","),
        "pass_config": pass_config,
    }

    # Names of the match counters to verify for this configuration.
    matches_check = [
        "norm_rope_fusion",
        "ar_rms_fusion",
        "sequence_parallel",
        "async_tp",
    ]

    run_e2e_fusion_test(
        model_name,
        matches,
        model_kwargs,
        attn_backend,
        compilation_config,
        matches_check,
        tp_size=2,
    )
|