[Bugfix] Use latency MOE backend as default for Flashinfer and other misc fixes (#27439)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
@@ -168,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
|
||||
out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
|
||||
|
||||
out, out_scale = ops.scaled_fp4_quant(x, global_scale)
|
||||
|
||||
scale_ans = recover_swizzled_scales(out_scale, m, n)
|
||||
out_ans = cast_from_fp4(out, m, n)
|
||||
|
||||
torch.testing.assert_close(out_ans, out_ref)
|
||||
torch.testing.assert_close(scale_ans, scale_ref)
|
||||
|
||||
Reference in New Issue
Block a user