[Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)

Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
bnellnm
2025-09-17 19:43:31 -04:00
committed by GitHub
parent e6585ddb45
commit 5963b98b46
68 changed files with 2698 additions and 2526 deletions

View File

@@ -6,6 +6,8 @@ import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize, BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
@@ -56,13 +58,18 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
rank=0,
)
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_s,
w2_scale=w2_s,
per_act_token_quant=False,
block_shape=BLOCK_SIZE,
)
# triton (reference)
triton_experts = BatchedTritonExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
use_fp8_w8a8=True,
per_act_token_quant=False,
block_shape=BLOCK_SIZE,
quant_config=quant_config,
)
mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts)
@@ -73,8 +80,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
w1_scale=w1_s,
w2_scale=w2_s,
global_num_experts=E,
)
@@ -82,8 +87,7 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
deepgemm_experts = BatchedDeepGemmExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=1,
block_shape=BLOCK_SIZE,
per_act_token_quant=False,
quant_config=quant_config,
)
mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts)
@@ -94,8 +98,6 @@ def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
w1_scale=w1_s,
w2_scale=w2_s,
global_num_experts=E,
)