[TPU][V1][Bugfix] Fix w8a8 recompilation with GSM8K (#15714)

Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
Nicolò Lucchesi
2025-03-29 05:13:06 +01:00
committed by GitHub
parent 5b800f0932
commit da461f3cbf
4 changed files with 16 additions and 15 deletions


@@ -97,7 +97,8 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
block_size=-1,
int4_weight=False,
quantize_activation=True)
# `quantized_matmul` returns fp32; cast back to the activation
# dtype (bf16) for perf
out = out.to(x.dtype)
# Explicitly capture control flow to make dynamo happy.
# https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])