[Test] Add Benchmark and Unit Test for per_token_group_quant (#21860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils import fp8_utils
|
||||
from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
|
||||
@@ -42,3 +42,32 @@ def test_per_token_group_quant_fp8(shape, column_major: bool,
|
||||
|
||||
assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15)
|
||||
assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
@pytest.mark.parametrize("group_size", [64, 128])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_per_token_group_quant_int8(shape, group_size: int):
    """Check that the CUDA kernel and the Triton reference agree for
    per-token-group int8 quantization.

    Runs ``int8_utils.per_token_group_quant_int8`` twice on the same
    input — once on the native CUDA path, once with the platform's
    ``is_cuda`` patched to ``False`` so the Triton implementation is
    selected — and asserts the quantized values and scales match
    within tolerance.
    """
    torch.manual_seed(42)
    num_tokens, hidden_dim = shape

    # Multiply by 8 so the random values cover more of the int8 range.
    x = torch.randn(
        (num_tokens, hidden_dim), device="cuda", dtype=torch.bfloat16) * 8

    # Native CUDA path.
    cuda_q, cuda_s = int8_utils.per_token_group_quant_int8(x, group_size)

    # Triton reference: pretend the platform is not CUDA so the
    # fallback Triton kernel is dispatched instead.
    with patch("vllm.platforms.current_platform.is_cuda",
               return_value=False):
        triton_q, triton_s = int8_utils.per_token_group_quant_int8(
            x, group_size)

    assert torch.allclose(
        cuda_q.float(), triton_q.float(), atol=0.15, rtol=0.15)
    assert torch.allclose(cuda_s, triton_s, atol=0.01, rtol=0.01)
|
||||
|
||||
Reference in New Issue
Block a user