[ModelOpt] Load w13/w2_input_scale for all experts, nvfp4 (#26135)

Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Shu Wang
2025-10-21 00:50:31 -05:00
committed by GitHub
parent aef368aa08
commit f95da13c3d
3 changed files with 58 additions and 9 deletions

View File

@@ -263,3 +263,9 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
f"Unknown flashinfer moe backend: {flashinfer_moe_backend}"
f" expected one of {allowed_backends}"
)
def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool:
    """Return True when *backend* supports a global scale factor.

    Currently only the CUTLASS FlashInfer MoE backend qualifies; a
    ``None`` backend (or any other variant) reports False.
    """
    # TODO(shuw@nvidia): Update when new backends are added.
    return backend == FlashinferMoeBackend.CUTLASS