[torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
2025-06-12 11:31:04 -04:00
parent 96846bb360
commit f98548b9da
33 changed files with 622 additions and 79 deletions
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -547,6 +547,7 @@ class FlashInferImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: FlashInferMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with FlashInfer.

@@ -561,6 +562,11 @@ class FlashInferImpl(AttentionImpl):
        """
        assert output is not None, "Output tensor must be provided."

+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlashInferImpl")
+
        if attn_metadata is None:
            # Profiling run.
            return output