[W8A8 Block Linear Refactor][2/N] Remove W8A8Fp8BlockLinearOp and adopt Fp8 block linear kernel selections. (#33892)

Signed-off-by: maral <maralbahari.98@gmail.com>
Signed-off-by: Maral <maralbahari.98@gmail.com>
This commit is contained in:
Maral
2026-04-09 08:50:39 +08:00
committed by GitHub
parent 8332078cfd
commit 2e9034c998
35 changed files with 1710 additions and 904 deletions

View File

@@ -12,8 +12,8 @@ from tests.kernels.quant_utils import (
native_w8a8_block_matmul,
)
from vllm.config import VllmConfig
+from vllm.model_executor.kernels.linear.scaled_mm.cutlass import cutlass_scaled_mm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm,
     per_token_group_quant_fp8,
     w8a8_triton_block_scaled_mm,
 )
@@ -202,7 +202,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
# only aligned sizes are supported by deepgemm
if not should_use_deepgemm_for_fp8_linear(
-        output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True
+        output_dtype=out_dtype, weight_shape=B_fp32.shape, supports_deep_gemm=True
):
pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")