[Test] Add Benchmark and Unit Test for per_token_group_quant (#21860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils import fp8_utils
|
||||
from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
|
||||
@@ -42,3 +42,32 @@ def test_per_token_group_quant_fp8(shape, column_major: bool,
|
||||
|
||||
assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15)
|
||||
assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
@pytest.mark.parametrize("group_size", [64, 128])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_per_token_group_quant_int8(shape, group_size: int):
    """Check that the CUDA kernel and the Triton reference agree for
    per-token-group int8 quantization.

    Runs ``int8_utils.per_token_group_quant_int8`` twice on the same
    input — once on the native CUDA path, once with the platform's
    ``is_cuda`` patched to ``False`` so the Triton implementation is
    selected — and asserts the quantized values and scales match
    within tolerance.
    """
    torch.manual_seed(42)
    num_tokens, hidden_dim = shape

    # Multiply by 8 so the random values cover more of the int8 range.
    x = torch.randn(
        (num_tokens, hidden_dim), device="cuda", dtype=torch.bfloat16) * 8

    # Native CUDA path.
    cuda_q, cuda_s = int8_utils.per_token_group_quant_int8(x, group_size)

    # Triton reference: pretend the platform is not CUDA so the
    # fallback Triton kernel is dispatched instead.
    with patch("vllm.platforms.current_platform.is_cuda",
               return_value=False):
        triton_q, triton_s = int8_utils.per_token_group_quant_int8(
            x, group_size)

    assert torch.allclose(
        cuda_q.float(), triton_q.float(), atol=0.15, rtol=0.15)
    assert torch.allclose(cuda_s, triton_s, atol=0.01, rtol=0.01)
|
||||
|
||||
Reference in New Issue
Block a user