[Feat] Refactor for parallel_config in FusedMoEModularKernel (#30282)

Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-12-14 23:21:36 -05:00
parent b337647aa0
commit 3778673ea8
8 changed files with 32 additions and 27 deletions
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -247,11 +247,6 @@ def flashinfer_cutlass_moe_fp8(
    assert quant_config is not None

    # Construct modular kernel with block-scale support when requested.
-    parallel_config = getattr(
-        getattr(layer, "vllm_config", None),
-        "parallel_config",
-        None,
-    )
    fused_experts = mk.FusedMoEModularKernel(
        build_flashinfer_fp8_cutlass_moe_prepare_finalize(
            moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
@@ -262,7 +257,7 @@ def flashinfer_cutlass_moe_fp8(
            out_dtype=hidden_states.dtype,
            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
        ),
-        parallel_config=parallel_config,
+        moe_parallel_config=layer.moe_parallel_config,
    )

    return fused_experts(