[W8A8 Block Linear Refactor][2/N] Remove W8A8Fp8BlockLinearOp and adopt Fp8 block linear kernel selections. (#33892)

Signed-off-by: maral <maralbahari.98@gmail.com>
Signed-off-by: Maral <maralbahari.98@gmail.com>
This commit is contained in:
Maral
2026-04-09 08:50:39 +08:00
committed by GitHub
parent 8332078cfd
commit 2e9034c998
35 changed files with 1710 additions and 904 deletions

View File

@@ -12,8 +12,8 @@ from tests.kernels.quant_utils import (
native_w8a8_block_matmul,
)
from vllm.config import VllmConfig
+from vllm.model_executor.kernels.linear.scaled_mm.cutlass import cutlass_scaled_mm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    cutlass_scaled_mm,
     per_token_group_quant_fp8,
     w8a8_triton_block_scaled_mm,
 )
@@ -202,7 +202,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
# only aligned sizes are supported by deepgemm
if not should_use_deepgemm_for_fp8_linear(
-        output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True
+        output_dtype=out_dtype, weight_shape=B_fp32.shape, supports_deep_gemm=True
):
pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")