[W8A8 Block Linear Refactor][2/N] Remove W8A8BlockFp8LinearOp and adopt Fp8 block linear kernel selections. (#33892)
Signed-off-by: maral <maralbahari.98@gmail.com>
Signed-off-by: Maral <maralbahari.98@gmail.com>
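For reviewers skimming the diff: the test's wiring check changes shape with this refactor. A minimal sketch of the new access path, using only names that appear in the diff below (`qkv_proj` stands for any block-FP8 linear layer pulled from a loaded model; the model/layer setup is elided):

```python
from vllm.model_executor.kernels.linear import Fp8BlockScaledMMLinearKernel
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8

def check_block_fp8_wiring(qkv_proj) -> None:
    # Old wiring (removed): qkv_proj.scheme.w8a8_block_fp8_linear was a
    # W8A8BlockFp8LinearOp, and its input quant op lived at .input_quant_op.
    # New wiring: the scheme holds a selected kernel that owns the QuantFP8 op.
    assert isinstance(qkv_proj.scheme.fp8_linear, Fp8BlockScaledMMLinearKernel)
    assert isinstance(qkv_proj.scheme.fp8_linear.quant_fp8, QuantFP8)
```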
@@ -16,6 +16,9 @@ from compressed_tensors.quantization import (
 )

 from tests.models.utils import check_logprobs_close
+from vllm.model_executor.kernels.linear import (
+    Fp8BlockScaledMMLinearKernel,
+)
 from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsConfig,
@@ -29,7 +32,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsWNA16,
 )
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
 from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
     cutlass_fp4_supported,
 )
@@ -473,16 +475,14 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
             qkv_proj = layer.self_attn.qkv_proj
             assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
             assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
-            assert isinstance(
-                qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
-            )
+            assert isinstance(qkv_proj.scheme.fp8_linear, Fp8BlockScaledMMLinearKernel)

             assert qkv_proj.weight.dtype is fp8_dtype
             assert qkv_proj.weight_scale.dtype is torch.float32
             assert len(qkv_proj.weight.shape) == 2
             assert len(qkv_proj.weight_scale.shape) == 2

-            input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
+            input_quant_op = qkv_proj.scheme.fp8_linear.quant_fp8
             assert isinstance(input_quant_op, QuantFP8)
             assert input_quant_op._forward_method in (
                 input_quant_op.forward_cuda,
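The test's dtype and shape assertions reflect the W8A8 block-FP8 layout: a 2-D weight in the platform's FP8 dtype plus a 2-D `float32` scale tensor with one scale per weight block. A rough, self-contained sketch of that layout follows; the 128×128 tiling, the helper name, and the tensor sizes are illustrative assumptions (the common layout for block-quantized FP8 weights), not taken from this PR:

```python
import torch  # requires a PyTorch build with float8 support (>= 2.1)

def quant_fp8_blockwise(w: torch.Tensor, block: int = 128):
    """Quantize a 2-D weight to float8_e4m3fn with one float32 scale
    per (block x block) tile; returns (q_weight, weight_scale)."""
    assert w.dim() == 2 and w.shape[0] % block == 0 and w.shape[1] % block == 0
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn
    rows, cols = w.shape[0] // block, w.shape[1] // block
    # View the weight as (rows, block, cols, block) tiles, take a per-tile
    # absmax, and derive one scale per tile.
    tiles = w.reshape(rows, block, cols, block)
    absmax = tiles.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-12)
    scale = absmax / fp8_max  # float32, broadcastable over each tile
    q = (tiles / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return q.reshape_as(w), scale.reshape(rows, cols).float()

w = torch.randn(256, 512)
qw, ws = quant_fp8_blockwise(w)
assert qw.dtype is torch.float8_e4m3fn and ws.dtype is torch.float32
assert qw.dim() == 2 and ws.dim() == 2  # matches the test's shape checks
```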