[Misc/Testing] Use torch.testing.assert_close (#7324)

2024-08-15 21:24:04 -07:00
parent e165528778
commit 50b8d08dbd
25 changed files with 197 additions and 188 deletions
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -69,4 +69,4 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
    ref_iscale = one / ref_scale
    ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
        fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn)
-    return ref_out, ref_scale
+    return ref_out, ref_scale.view((1, ))
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -47,7 +47,7 @@ def test_act_and_mul(
    ref_out = layer.forward_native(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
-    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
+    torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)


@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@@ -73,7 +73,7 @@ def test_activation(
    layer = activation()
    out = layer(x)
    ref_out = layer.forward_native(x)
-    assert torch.allclose(out,
-                          ref_out,
-                          atol=get_default_atol(out),
-                          rtol=get_default_rtol(out))
+    torch.testing.assert_close(out,
+                               ref_out,
+                               atol=get_default_atol(out),
+                               rtol=get_default_rtol(out))
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -276,7 +276,7 @@ def test_paged_attention(
    atol, rtol = 1e-3, 1e-5
    if kv_cache_dtype == "fp8":
        atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)


 def ref_multi_query_kv_attention(
@@ -379,4 +379,4 @@ def test_multi_query_kv_attention(
    )
    atol = get_default_atol(output) if is_hip() else 1e-3
    rtol = get_default_rtol(output) if is_hip() else 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -327,7 +327,7 @@ def test_paged_attention(
    atol, rtol = 1e-3, 1e-5
    if kv_cache_dtype == "fp8":
        atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)


 def ref_multi_query_kv_attention(
@@ -441,4 +441,4 @@ def test_varlen_blocksparse_attention_prefill(
        scale,
        dtype,
    )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -98,10 +98,10 @@ def test_copy_blocks(

    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
-        assert torch.allclose(key_cache, cloned_key_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
    for value_cache, cloned_value_cache in zip(value_caches,
                                               cloned_value_caches):
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -184,17 +184,17 @@ def test_reshape_and_cache(
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]

    if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
-                              cloned_key_cache,
-                              atol=0.001,
-                              rtol=0.1)
-        assert torch.allclose(result_value_cache,
-                              cloned_value_cache,
-                              atol=0.001,
-                              rtol=0.1)
+        torch.testing.assert_close(result_key_cache,
+                                   cloned_key_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
+        torch.testing.assert_close(result_value_cache,
+                                   cloned_value_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
    else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -290,17 +290,17 @@ def test_reshape_and_cache_flash(
        cloned_value_cache[block_idx, block_offset, :, :] = value[i]

    if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
-                              cloned_key_cache,
-                              atol=0.001,
-                              rtol=0.1)
-        assert torch.allclose(result_value_cache,
-                              cloned_value_cache,
-                              atol=0.001,
-                              rtol=0.1)
+        torch.testing.assert_close(result_key_cache,
+                                   cloned_key_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
+        torch.testing.assert_close(result_value_cache,
+                                   cloned_value_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
    else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)


@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@@ -372,10 +372,10 @@ def test_swap_blocks(
                    block_mapping_tensor)

    for src, dst in block_mapping:
-        assert torch.allclose(src_key_caches_clone[src].cpu(),
-                              dist_key_caches[0][dst].cpu())
-        assert torch.allclose(src_value_caches_clone[src].cpu(),
-                              dist_value_caches[0][dst].cpu())
+        torch.testing.assert_close(src_key_caches_clone[src].cpu(),
+                                   dist_key_caches[0][dst].cpu())
+        torch.testing.assert_close(src_value_caches_clone[src].cpu(),
+                                   dist_value_caches[0][dst].cpu())


@pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -411,4 +411,4 @@ def test_fp8_e4m3_conversion(
    converted_cache = torch.empty_like(cache)
    ops.convert_fp8(converted_cache, cache_fp8)

-    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
+    torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -74,7 +74,7 @@ def cutlass_fp8_gemm_helper(m: int,
    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    assert torch.allclose(out, baseline, rtol=1e-2, atol=5e-2)
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=5e-2)


 def cutlass_int8_gemm_helper(m: int,
@@ -106,7 +106,7 @@ def cutlass_int8_gemm_helper(m: int,
    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
    baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)


@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
@@ -252,7 +252,7 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
    azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding

    a_dq = scale_a * (aq_i32 + azp_aq_i8).to(dtype=torch.float32)
-    assert torch.allclose(a_dq, scale_a * aq_f32 + azp_a)
+    torch.testing.assert_close(a_dq, scale_a * aq_f32 + azp_a)

    baseline_dq = torch.mm(a_dq, b_dq).to(out_dtype)

@@ -271,8 +271,8 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
                                scale_b,
                                out_dtype=out_dtype,
                                bias=azp_bias[0, :])
-    assert torch.allclose(out, baseline_dq, rtol=1e-2, atol=1e0)
-    assert torch.allclose(out, baseline_q, rtol=1e-2, atol=1e0)
+    torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0)
+    torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0)


@pytest.mark.parametrize("m", [32, 64, 128])
@@ -302,7 +302,10 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
    azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding

    a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32)
-    assert torch.allclose(a_dq, scale_a * aq_f32 - azp_a, rtol=1e-4, atol=1e-3)
+    torch.testing.assert_close(a_dq,
+                               scale_a * aq_f32 - azp_a,
+                               rtol=1e-4,
+                               atol=1e-3)

    if use_bias:
        bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5
@@ -335,8 +338,8 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
    # float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05%
    rtol = 1e-2 if out_dtype == torch.bfloat16 else 1e-3
    atol = 1e-3
-    assert torch.allclose(out, baseline_dq, rtol=rtol, atol=atol)
-    assert torch.allclose(out, baseline_q, rtol=rtol, atol=atol)
+    torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
+    torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)


 # Test working with a subset of A and B
@@ -363,7 +366,7 @@ def test_cutlass_subset():
                                  scale_b,
                                  out_dtype=torch.bfloat16)

-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)


 # Test to make sure cuda graphs work
@@ -411,4 +414,4 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):

    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                        scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16)
-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -126,7 +126,7 @@ def test_flash_attn_with_paged_kv(
        scale=scale,
        soft_cap=soft_cap,
    )
-    assert torch.allclose(output, ref_output, atol=2e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    # assert_close already reports the greatest absolute difference.
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2)


@@ -211,5 +211,5 @@ def test_varlen_with_paged_kv(
        sliding_window=sliding_window,
        soft_cap=soft_cap,
    )
-    assert torch.allclose(output, ref_output, atol=2e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    # assert_close already reports the greatest absolute difference.
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2)
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -144,7 +144,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    # assert_close already reports the greatest absolute difference.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)


@@ -244,5 +244,5 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
-        f"{torch.max(torch.abs(output - ref_output))}"
+    # assert_close already reports the greatest absolute difference.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
@@ -37,9 +37,9 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                                               scale_ub=scale_ub,
                                               use_per_token_if_dynamic=True)

-    assert torch.allclose(ref_scales, ops_scales)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(ref_scales, ops_scales)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
+                               ops_out.to(dtype=torch.float32))


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -57,9 +57,9 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
    ref_out, ref_scale = ref_dynamic_per_tensor_fp8_quant(x)
    ops_out, ops_scale = ops.scaled_fp8_quant(x)

-    assert torch.allclose(ref_scale, ops_scale)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(ref_scale, ops_scale)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
+                               ops_out.to(dtype=torch.float32))


 # Regression test for a case with large activations where an int32 index cannot
@@ -84,4 +84,4 @@ def test_fp8_quant_large(seed: int) -> None:
    ref_out = ref_out.to(dtype=dtype)
    ops_out = ops_out.to(dtype=dtype)

-    assert torch.allclose(ref_out, ops_out)
+    torch.testing.assert_close(ref_out, ops_out)
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -29,9 +29,10 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
    # kernel
    ops_out, ops_scales = scaled_int8_quant(x)

-    assert torch.allclose(ops_scales, ref_scales)
-    assert torch.allclose(ops_out, ref_out,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(ops_scales, ref_scales)
+    torch.testing.assert_close(
+        ops_out, ref_out, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -54,5 +55,6 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                     int8_traits.max).to(torch.int8)
    out2, _ = scaled_int8_quant(x, scale)

-    assert torch.allclose(out1, out2,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(
+        out1, out2, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -48,7 +48,7 @@ def test_rms_norm(
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    if add_residual:
-        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
-        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
    else:
-        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -122,7 +122,7 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    )
    torch.cuda.synchronize()

-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)


@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@@ -174,7 +174,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
    )
    torch.cuda.synchronize()

-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)


@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -50,7 +50,7 @@ def test_fused_moe(
    score = torch.randn((m, e), device='cuda', dtype=dtype)
    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
    torch_output = torch_moe(a, w1, w2, score, topk)
-    assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
+    torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)


@pytest.mark.parametrize("dtype",
@@ -95,7 +95,7 @@ def test_mixtral_moe(dtype: torch.dtype):
        torch.bfloat16: 1e-2,
    }

-    assert torch.allclose(hf_states.flatten(0, 1),
-                          vllm_states,
-                          rtol=mixtral_moe_tol[dtype],
-                          atol=mixtral_moe_tol[dtype])
+    torch.testing.assert_close(hf_states.flatten(0, 1),
+                               vllm_states,
+                               rtol=mixtral_moe_tol[dtype],
+                               atol=mixtral_moe_tol[dtype])
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -67,14 +67,14 @@ def test_rotary_embedding(
    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))


@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@@ -129,14 +129,14 @@ def test_batched_rotary_embedding(
                                                          dtype=torch.long,
                                                          device=device))
    # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))


@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@@ -200,14 +200,14 @@ def test_batched_rotary_embedding_multi_lora(
    out_query, out_key = rope.forward(positions, query, key,
                                      query_offsets.flatten())
    # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))


@torch.inference_mode()
--- a/tests/kernels/test_sampler.py
+++ b/tests/kernels/test_sampler.py
@@ -100,11 +100,11 @@ def test_sample_decoding_only(random_sampling, max_best_of,
        if modify_greedy_probs and not request_uses_random_sampling:
            # If we are modifying greedy probs and the request is greedy,
            # we want to make sure the probs tensor is modified in place
-            assert torch.allclose(
+            torch.testing.assert_close(
                probs[i][sampled_tokens[i]],
                torch.full_like(probs[i][sampled_tokens[i]], 1.0))
            assert torch.sum(probs[i]) == 1.0
-            assert torch.allclose(
+            torch.testing.assert_close(
                sampled_modified_probs[i][0],
                torch.full_like(sampled_modified_probs[i][0], 1.0))
        elif request_uses_random_sampling:
@@ -117,8 +117,8 @@ def test_sample_decoding_only(random_sampling, max_best_of,
            # If the request is greedy and we are not modifying greedy probs,
            # we want to make sure sampled_modified_probs tensor is the same as
            # the probs tensor.
-            assert torch.allclose(sampled_modified_probs[i][0],
-                                  probs[i][sampled_tokens[i]])
+            torch.testing.assert_close(sampled_modified_probs[i],
+                                       probs[i][sampled_tokens[i]])

    if save_logprobs:
        assert sampled_logprobs.shape == (bs, max_best_of)
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -924,5 +924,5 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
    * output_under_test: actually observed output value
    '''
    ideal_output = test_params.packed_qkvo.ideal_output
-    assert torch.allclose(ideal_output,
-                          output_under_test.view_as(ideal_output))
+    torch.testing.assert_close(ideal_output,
+                               output_under_test.view_as(ideal_output))