[Bugfix] Use latency MOE backend as default for Flashinfer and other misc fixes (#27439)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-11-07 04:18:39 -08:00
parent e0919f331d
commit 72b1c2ae2c
7 changed files with 47 additions and 12 deletions
--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -168,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)

    out, out_scale = ops.scaled_fp4_quant(x, global_scale)
-
    scale_ans = recover_swizzled_scales(out_scale, m, n)
    out_ans = cast_from_fp4(out, m, n)
-
    torch.testing.assert_close(out_ans, out_ref)
    torch.testing.assert_close(scale_ans, scale_ref)