[Feature][Hardware][AMD] Add fp8 Linear Layer for ROCm (#7210)

This commit is contained in:
Charlie Fu
2024-08-16 12:06:30 -05:00
committed by GitHub
parent ec724a725e
commit e837b624f2
7 changed files with 164 additions and 49 deletions

View File

@@ -2,7 +2,8 @@ import pytest
 import torch
 import vllm._custom_ops as ops
-from tests.kernels.quant_utils import (ref_dynamic_per_tensor_fp8_quant,
+from tests.kernels.quant_utils import (FP8_DTYPE,
+                                       ref_dynamic_per_tensor_fp8_quant,
                                        ref_dynamic_per_token_quant)
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -31,8 +32,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
     scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \
         if scale_ub else None
-    ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.float8_e4m3fn,
-                                                      scale_ub)
+    ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub)
     ops_out, ops_scales = ops.scaled_fp8_quant(x,
                                                scale_ub=scale_ub,
                                                use_per_token_if_dynamic=True)