[ModelOpt] Load w13/w2_input_scale for all experts, nvfp4 (#26135)

Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Shu Wang
2025-10-21 00:50:31 -05:00
committed by GitHub
parent aef368aa08
commit f95da13c3d
3 changed files with 58 additions and 9 deletions

View File

@@ -263,3 +263,9 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
f"Unknown flashinfer moe backend: {flashinfer_moe_backend}"
f" expected one of {allowed_backends}"
)
def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool:
    """Return True when *backend* supports a global scale factor.

    Currently only the CUTLASS FlashInfer MoE backend qualifies; a
    ``None`` backend (or any other variant) reports False.
    """
    # TODO(shuw@nvidia): Update when new backends are added.
    return backend == FlashinferMoeBackend.CUTLASS