[Bugfix] Use latency MOE backend as default for Flashinfer and other misc fixes (#27439)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
Pavani Majety
2025-11-07 04:18:39 -08:00
committed by GitHub
parent e0919f331d
commit 72b1c2ae2c
7 changed files with 47 additions and 12 deletions

View File

@@ -168,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
out, out_scale = ops.scaled_fp4_quant(x, global_scale)
scale_ans = recover_swizzled_scales(out_scale, m, n)
out_ans = cast_from_fp4(out, m, n)
torch.testing.assert_close(out_ans, out_ref)
torch.testing.assert_close(scale_ans, scale_ref)