Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
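For context on what this migration involves: the yapf and isort sections in pyproject.toml give way to a single ruff configuration, and the formatting hooks are pointed at ruff. The snippet below is a minimal sketch of such a configuration, assuming ruff's defaults; the option names are real ruff settings, but the values are illustrative rather than the exact ones this PR adds.

    [tool.ruff]
    # ruff's default line length is 88 (Black-compatible).
    line-length = 88

    [tool.ruff.lint]
    # "I" enables isort-compatible import sorting, replacing standalone isort;
    # "E" and "F" are the pycodestyle and pyflakes baselines.
    select = ["E", "F", "I"]

With that in place, `ruff check --fix .` applies lint fixes (including import sorting) and `ruff format .` rewrites the tree in a Black-compatible style; those two commands are what produce diffs like the hunks below.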
@@ -7,14 +7,18 @@ from typing import Optional
 import pytest
 import torch

-from tests.kernels.moe.utils import (batched_moe,
-                                     make_quantized_test_activations,
-                                     make_test_weights, naive_batched_moe)
+from tests.kernels.moe.utils import (
+    batched_moe,
+    make_quantized_test_activations,
+    make_test_weights,
+    naive_batched_moe,
+)
 from tests.kernels.quant_utils import native_batched_masked_quant_matmul
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    invoke_moe_batched_triton_kernel)
+    invoke_moe_batched_triton_kernel,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl
@@ -68,23 +72,32 @@ class BatchedMMTensors:

     @staticmethod
     def make_tensors(config: BatchedMMConfig):
-        A = torch.randn(
-            (config.num_experts, config.max_tokens_per_expert, config.K),
+        A = (
+            torch.randn(
+                (config.num_experts, config.max_tokens_per_expert, config.K),
+                device="cuda",
+                dtype=config.in_dtype,
+            )
+            / 10
+        )
+        B = torch.randn(
+            (config.num_experts, config.N, config.K),
             device="cuda",
-            dtype=config.in_dtype) / 10
-        B = torch.randn((config.num_experts, config.N, config.K),
-                        device="cuda",
-                        dtype=config.in_dtype)
+            dtype=config.in_dtype,
+        )
         C = torch.zeros(
             (config.num_experts, config.max_tokens_per_expert, config.N),
             device="cuda",
-            dtype=config.out_dtype)
+            dtype=config.out_dtype,
+        )

-        num_expert_tokens = torch.randint(low=0,
-                                          high=config.max_tokens_per_expert,
-                                          size=(config.num_experts, ),
-                                          device="cuda",
-                                          dtype=torch.int32)
+        num_expert_tokens = torch.randint(
+            low=0,
+            high=config.max_tokens_per_expert,
+            size=(config.num_experts,),
+            device="cuda",
+            dtype=torch.int32,
+        )

         return BatchedMMTensors(A, B, C, num_expert_tokens)

@@ -96,10 +109,15 @@ class BatchedMMTensors:
 @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
-def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
-                    N: int, dtype: torch.dtype,
-                    block_shape: Optional[list[int]],
-                    per_act_token_quant: bool):
+def test_batched_mm(
+    num_experts: int,
+    max_tokens_per_expert: int,
+    K: int,
+    N: int,
+    dtype: torch.dtype,
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
+):
     current_platform.seed_everything(7)

     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
@@ -117,11 +135,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     act_dtype = dtype
     quant_dtype = None

-    num_expert_tokens = torch.randint(low=0,
-                                      high=max_tokens_per_expert,
-                                      size=(num_experts, ),
-                                      device="cuda",
-                                      dtype=torch.int32)
+    num_expert_tokens = torch.randint(
+        low=0,
+        high=max_tokens_per_expert,
+        size=(num_experts,),
+        device="cuda",
+        dtype=torch.int32,
+    )

     A, A_q, A_scale = make_quantized_test_activations(
         num_experts,
@@ -151,7 +171,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     compute_tl_dtype = {
         torch.float16: tl.float16,
         torch.bfloat16: tl.bfloat16,
-        torch.float32: tl.float32
+        torch.float32: tl.float32,
     }[test_output.dtype]

     assert A_q.dtype == B_q.dtype
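The one-token hunks here and in the next hunk are ruff's Black-style "magic trailing comma" at work: when a bracketed construct spans multiple lines, the formatter appends a comma after the last element, and that trailing comma in turn pins the expanded one-item-per-line layout on future runs. A minimal sketch with hypothetical names:

    # Without a trailing comma, a literal that fits the line limit collapses:
    tolerances = {"fp16": 6e-2, "bf16": 6e-2}

    # A comma after the last entry keeps it expanded, one entry per line:
    tolerances = {
        "fp16": 6e-2,
        "bf16": 6e-2,
    }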
@@ -173,7 +193,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         config={
             "BLOCK_SIZE_M": 16,
             "BLOCK_SIZE_N": 16,
-            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
+            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32,
         },
         per_act_token_quant=per_act_token_quant,
         block_shape=block_shape,
@@ -186,11 +206,16 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         num_expert_tokens,
     )

-    q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
-                                                      num_expert_tokens,
-                                                      A_scale, B_scale,
-                                                      block_shape,
-                                                      per_act_token_quant)
+    q_ref_output = native_batched_masked_quant_matmul(
+        A_q,
+        B_q,
+        q_ref_output,
+        num_expert_tokens,
+        A_scale,
+        B_scale,
+        block_shape,
+        per_act_token_quant,
+    )

     rtol, atol = {
         torch.float16: (6e-2, 6e-2),
@@ -308,12 +333,6 @@ def test_fused_moe_batched_experts(
         block_shape=block_shape,
     )

-    torch.testing.assert_close(batched_output,
-                               baseline_output,
-                               atol=3e-2,
-                               rtol=2e-2)
+    torch.testing.assert_close(batched_output, baseline_output, atol=3e-2, rtol=2e-2)

-    torch.testing.assert_close(triton_output,
-                               batched_output,
-                               atol=2e-2,
-                               rtol=2e-2)
+    torch.testing.assert_close(triton_output, batched_output, atol=2e-2, rtol=2e-2)
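This last hunk shows the inverse rule: a call without a trailing comma that fits within the configured line length is collapsed onto a single line. The collapsed assert_close calls run to roughly 85 columns, which fits ruff's default 88-column limit but not yapf's former 80-column style, so the migration evidently raised the line length as well (that config change sits outside this excerpt). In CI, `ruff format --check .` and `ruff check .` verify the result without rewriting files.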