Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
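
For context: a migration like this typically removes the yapf and isort configuration from pyproject.toml and the pre-commit hooks and replaces them with a single ruff configuration. The snippet below is only an illustrative sketch of that kind of setup, not the configuration added by this commit; the line length, target version, and rule selection shown here are assumptions.

[tool.ruff]
# Illustrative values only; the real settings live in this commit's pyproject.toml changes.
line-length = 88
target-version = "py39"

[tool.ruff.lint]
# "I" enables the isort-compatible import-sorting rules, replacing standalone isort.
select = ["E", "F", "I"]

[tool.ruff.lint.isort]
# Treat vllm as first-party so its imports are grouped separately from third-party ones.
known-first-party = ["vllm"]

[tool.ruff.format]
# The ruff formatter replaces yapf and follows a black-compatible style.
quote-style = "double"

With a setup of this shape, "ruff check --fix ." performs the import sorting previously handled by isort, and "ruff format ." applies the black-style layout that produces the reflowed imports and calls visible in the hunks below.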


@@ -6,24 +6,28 @@ import pytest
 import torch
 
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.config import (
-    fp8_w8a8_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    apply_flashinfer_per_tensor_scale_fp8, flashinfer_cutlass_moe_fp8,
-    register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
-    swap_w13_to_w31)
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    input_to_float8)
+    apply_flashinfer_per_tensor_scale_fp8,
+    flashinfer_cutlass_moe_fp8,
+    register_moe_scaling_factors,
+    rotate_flashinfer_fp8_moe_weights,
+    swap_w13_to_w31,
+)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
 from vllm.model_executor.models.llama4 import Llama4MoE
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
-if not has_flashinfer_cutlass_fused_moe(
-) or not current_platform.has_device_capability(100):
-    pytest.skip("Requires flashinfer_cutlass_fused_moe and nvfp4 support",
-                allow_module_level=True)
+if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability(
+    100
+):
+    pytest.skip(
+        "Requires flashinfer_cutlass_fused_moe and nvfp4 support",
+        allow_module_level=True,
+    )
 
 NUM_EXPERTS = [16]
 TOP_KS = [1]
@@ -39,8 +43,7 @@ MNK_FACTORS = [
     (1, 4096, 5120),
 ]
 
-vllm_config = VllmConfig(parallel_config=ParallelConfig(
-    pipeline_parallel_size=1))
+vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
 vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192
 
@@ -74,18 +77,17 @@ class TestData:
     layer: torch.nn.Module
 
     @staticmethod
-    def make_moe_tensors_8bit(m: int, k: int, n: int, e: int,
-                              reorder: bool) -> "TestData":
-        hidden_states = torch.randn(
-            (m, k), device="cuda", dtype=torch.bfloat16) / 10
+    def make_moe_tensors_8bit(
+        m: int, k: int, n: int, e: int, reorder: bool
+    ) -> "TestData":
+        hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
         w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
         w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
 
         # Scale to fp8
         _, a1_scale = input_to_float8(hidden_states)
         a1_scale = 1.0 / a1_scale
-        a2_scale = torch.scalar_tensor(1.0).to(device="cuda").to(
-            dtype=torch.float32)
+        a2_scale = torch.scalar_tensor(1.0).to(device="cuda").to(dtype=torch.float32)
 
         w13_quantized, w13_weight_scale = quant_fp8_per_tensor_batches(w13)
         w2_quantized, w2_weight_scale = quant_fp8_per_tensor_batches(w2)
@@ -102,8 +104,7 @@ class TestData:
         # flashinfer expects swapped rows for w13
         layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
         if reorder:
-            rotate_flashinfer_fp8_moe_weights(layer.w13_weight,
-                                              layer.w2_weight)
+            rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
         layer.custom_routing_function = Llama4MoE.custom_routing_function
         layer.intermediate_size_per_partition = n
         layer.ep_rank = 0
@@ -145,7 +146,8 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
         top_k=topk,
         renormalize=False,
         custom_routing_function=Llama4MoE.custom_routing_function,
-        scoring_func="softmax")
+        scoring_func="softmax",
+    )
 
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=td.w13_weight_scale,
@@ -178,12 +180,10 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
         top_k=topk,
         num_expert_group=None,
         topk_group=None,
-        apply_router_weight_on_input=True)
+        apply_router_weight_on_input=True,
+    )
 
-    torch.testing.assert_close(output,
-                               flashinfer_output,
-                               atol=5.5e-2,
-                               rtol=1e-2)
+    torch.testing.assert_close(output, flashinfer_output, atol=5.5e-2, rtol=1e-2)
 
 
 @pytest.mark.skip(
@@ -213,7 +213,8 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
         top_k=topk,
         renormalize=False,
         custom_routing_function=Llama4MoE.custom_routing_function,
-        scoring_func="softmax")
+        scoring_func="softmax",
+    )
 
     quant_config = fp8_w8a8_moe_quant_config(
         w1_scale=td.w13_weight_scale,
@@ -250,7 +251,6 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
         apply_router_weight_on_input=True,
     )
 
-    torch.testing.assert_close(output,
-                               flashinfer_cutlass_output,
-                               atol=5.5e-2,
-                               rtol=1e-2)
+    torch.testing.assert_close(
+        output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
+    )