[W8A8 Block Linear Refactor][2/N] Remove W8A8Fp8BlockLinearOp and adopt Fp8 block linear kernel selections. (#33892)

Signed-off-by: Maral <maralbahari.98@gmail.com>
Author: Maral
Date: 2026-04-09 08:50:39 +08:00
Committed by: GitHub
Parent: 8332078cfd
Commit: 2e9034c998
35 changed files with 1710 additions and 904 deletions
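As the title describes, this change replaces the single hard-wired W8A8Fp8BlockLinearOp with a per-platform kernel selection for block-FP8 linear layers. As a rough illustration only (the helper and backend names below are hypothetical sketches, not the API this commit introduces), such a selection hook might look like:

# Illustrative sketch of a block-FP8 kernel-selection hook; every name
# here is hypothetical and stands in for whatever this commit adds.
from typing import Callable

import torch


def _triton_block_fp8_gemm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Placeholder for a Triton block-FP8 GEMM; plain matmul keeps it runnable.
    return a @ b


def _cutlass_block_fp8_gemm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Placeholder for a CUTLASS block-FP8 GEMM.
    return a @ b


def select_block_fp8_gemm(
    cutlass_supported: bool,
) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
    # Pick a backend once at layer-construction time instead of baking a
    # single op into the layer, which is the spirit of removing the old
    # W8A8Fp8BlockLinearOp wrapper.
    if cutlass_supported:
        return _cutlass_block_fp8_gemm
    return _triton_block_fp8_gemm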


@@ -66,6 +66,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
         self.kv_cache_dtype = kv_cache_dtype
         self.device = device
         self.vllm_config = vllm_config
+        self.dtype = vllm_config.model_config.dtype
         self.attn = Attention(
             num_heads=self.num_qo_heads,
@@ -155,6 +156,7 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
             activation_quant_key=self.quant_key,
             weight_quant_key=self.quant_key,
             device=self.device,
+            input_dtype=self.dtype,
         )
         w = kwargs.get("w")
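Taken together, the two hunks thread the model's activation dtype from vllm_config into the fused quant pattern via the new input_dtype keyword. A toy sketch of why the pattern might need this, assuming (hypothetically) that kernel matching depends on the unquantized activation dtype; the helper below is illustrative, not the real fusion code:

import torch


def pick_quant_conversion(input_dtype: torch.dtype) -> str:
    # Illustrative only: a fused attention+quant pattern must know the
    # unquantized activation dtype to match the right conversion kernel.
    if input_dtype == torch.bfloat16:
        return "bf16->fp8 fused epilogue"
    return "fp16->fp8 fused epilogue"


assert pick_quant_conversion(torch.bfloat16) == "bf16->fp8 fused epilogue"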